diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 5e71c32fed..a861d60a97 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -7,6 +7,9 @@ on: branches: [main] pull_request: +permissions: + contents: write + jobs: create-table-on-pr: if: github.event_name == 'pull_request' @@ -32,8 +35,6 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - with: - token: ${{ secrets.RELEASE }} - uses: actions/setup-python@v4 with: @@ -49,6 +50,8 @@ jobs: make build-docs - name: Push table + env: + GITHUB_TOKEN: ${{ github.token }} run: | git config --global user.email "github-actions[bot]@users.noreply.github.com" git config --global user.name "github-actions[bot]" @@ -57,6 +60,7 @@ jobs: echo "No changes detected" else git add docs/tasks.md - git commit -m "Update tasks table" + git add docs/benchmarks.md + git commit -m "Update tasks & benchmarks tables" git push fi diff --git a/Makefile b/Makefile index 206dfca695..5bd3eb7033 100644 --- a/Makefile +++ b/Makefile @@ -38,6 +38,7 @@ build-docs: @echo "--- 📚 Building documentation ---" # since we do not have a documentation site, this just build tables for the .md files python docs/create_tasks_table.py + python docs/create_benchmarks_table.py model-load-test: diff --git a/README.md b/README.md index 8f8342d0f9..ed4badd0fe 100644 --- a/README.md +++ b/README.md @@ -78,8 +78,9 @@ The following links to the main sections in the usage documentation. 
| **General** | | | [Evaluating a Model](docs/usage/usage.md#evaluating-a-model) | How to evaluate a model | | [Evaluating on different Modalities](docs/usage/usage.md#evaluating-on-different-modalities) | How to evaluate image and image-text tasks | +| [MIEB](docs/mieb/readme.md) | How to run the Massive Image Embedding Benchmark | | **Selecting Tasks** | | -| [Selecting a benchmark](docs/usage/usage.md#selecting-a-benchmark) | How to select and filter tasks | +| [Selecting a benchmark](docs/usage/usage.md#selecting-a-benchmark) | How to select benchmarks | | [Task selection](docs/usage/usage.md#task-selection) | How to select and filter tasks | | [Selecting Split and Subsets](docs/usage/usage.md#selecting-evaluation-split-or-subsets) | How to select evaluation splits or subsets | | [Using a Custom Task](docs/usage/usage.md#using-a-custom-task) | How to evaluate on a custom task | @@ -96,7 +97,8 @@ The following links to the main sections in the usage documentation. | **Leaderboard** | | | [Running the Leaderboard Locally](docs/usage/usage.md#running-the-leaderboard-locally) | How to run the leaderboard locally | | [Report Data Contamination](docs/usage/usage.md#annotate-contamination) | How to report data contamination for a model | -| [Fetching Result from the Leaderboard](docs/usage/usage.md#fetching-results-from-the-leaderboard) | How to fetch the raw results from the leaderboard | +| [Loading and working with Results](docs/usage/results.md) | How to load and work with the raw results from the leaderboard, including making result dataframes | + ## Overview @@ -107,8 +109,8 @@ The following links to the main sections in the usage documentation. 
| 📋 [Tasks] | Overview of available tasks | | 📐 [Benchmarks] | Overview of available benchmarks | | **Contributing** | | -| 🤖 [Adding a model] | Information related to how to submit a model to MTEB and to the leaderboard | -| 👩‍🔬 [Reproducible workflows] | Information related to how to create reproducible workflows with MTEB | +| 🤖 [Adding a model] | How to submit a model to MTEB and to the leaderboard | +| 👩‍🔬 [Reproducible workflows] | How to create reproducible workflows with MTEB | | 👩‍💻 [Adding a dataset] | How to add a new task/dataset to MTEB | | 👩‍💻 [Adding a benchmark] | How to add a new benchmark to MTEB and to the leaderboard | | 🤝 [Contributing] | How to contribute to MTEB and set it up for development | @@ -172,3 +174,4 @@ Some of these amazing publications include (ordered chronologically): - Dawei Zhu, Liang Wang, Nan Yang, Yifan Song, Wenhao Wu, Furu Wei, Sujian Li. "[LongEmbed: Extending Embedding Models for Long Context Retrieval](https://arxiv.org/abs/2404.12096)" arXiv 2024 - Kenneth Enevoldsen, Márton Kardos, Niklas Muennighoff, Kristoffer Laigaard Nielbo. "[The Scandinavian Embedding Benchmarks: Comprehensive Assessment of Multilingual and Monolingual Text Embedding](https://arxiv.org/abs/2406.02396)" arXiv 2024 - Ali Shiraee Kasmaee, Mohammad Khodadad, Mohammad Arshi Saloot, Nick Sherck, Stephen Dokas, Hamidreza Mahyar, Soheila Samiee. "[ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance & Efficiency on a Specific Domain](https://arxiv.org/abs/2412.00532)" arXiv 2024 +- Chenghao Xiao, Isaac Chung, Imene Kerboua, Jamie Stirling, Xin Zhang, Márton Kardos, Roman Solomatin, Noura Al Moubayed, Kenneth Enevoldsen, Niklas Muennighoff. 
"[MIEB: Massive Image Embedding Benchmark](https://arxiv.org/abs/2504.10471)" arXiv 2025 diff --git a/docs/benchmarks.md b/docs/benchmarks.md index a222a241a6..a004220d09 100644 --- a/docs/benchmarks.md +++ b/docs/benchmarks.md @@ -5,29 +5,43 @@ The following table gives you an overview of the benchmarks in MTEB. -| Name | # Tasks | Task Types | Domains | Languages | -|------|---------|------------|---------|-----------| -| [BRIGHT](https://brightbenchmark.github.io/) | 1 | {'Retrieval': 1} | [Non-fiction] | eng | -| [ChemTEB](https://arxiv.org/abs/2412.00532) | 27 | {'BitextMining': 1, 'Classification': 17, 'Clustering': 2, 'PairClassification': 5, 'Retrieval': 2} | [Chemistry] | nld,tur,eng,ces,kor,zho,spa,hin,jpn,deu,fra,msa,por | -| [CoIR](https://github.com/CoIR-team/coir) | 10 | {'Retrieval': 10} | [Written, Programming] | javascript,ruby,sql,go,eng,java,php,python,c++ | -| [LongEmbed](https://arxiv.org/abs/2404.12096v2) | 6 | {'Retrieval': 6} | [Fiction, Academic, Written, Blog, Non-fiction, Spoken, Encyclopaedic] | eng | -| [MINERSBitextMining](https://arxiv.org/pdf/2406.07424) | 7 | {'BitextMining': 7} | [Reviews, Written, Social] | sqi,ban,srp,jpn,nds,lat,por,mon,kur,bul,slv,mak,deu,uzb,yor,kzj,max,kat,cha,yid,zsm,spa,pms,mhr,min,fao,heb,nij,mui,tuk,rus,bew,swe,pes,slk,ceb,bjn,ido,abs,ukr,ina,kab,tgl,cor,dan,kaz,fry,rej,hrv,ces,lfn,glg,dsb,hau,ace,urd,ben,yue,nld,eng,epo,ron,xho,wuu,cmn,ind,ang,hsb,mad,pam,nov,swh,bbc,pcm,ara,hye,mkd,nno,ast,jav,lvs,mal,swg,nob,tat,arz,vie,ile,tam,est,ber,bre,csb,pol,afr,cbk,bug,tzl,kor,ibo,hun,war,aze,tha,mar,uig,gla,orv,hin,amh,bel,sun,fin,cat,awa,gsw,isl,oci,ell,cym,arq,ita,fra,bos,dtp,eus,bhp,tel,tur,khm,lit,gle | -| MTEB(Europe, beta) | 74 | {'BitextMining': 7, 'Classification': 21, 'Clustering': 8, 'Retrieval': 15, 'InstructionRetrieval': 3, 'MultilabelClassification': 2, 'PairClassification': 6, 'Reranking': 3, 'STS': 9} | [Web, Fiction, Social, Academic, Religious, Written, Medical, Blog, Constructed, Non-fiction, 
Legal, News, Government, Reviews, Spoken, Encyclopaedic, Programming, Subtitles] | qvm,esk,nlg,toj,gup,llg,jpn,azj,for,lav,kmh,por,bsj,tna,upv,cta,smk,zty,qvz,ntj,ton,uvh,cjk,kgf,gaw,bak,seh,jiv,hui,ksr,uli,kwi,qvw,kkl,arl,msk,omw,aai,tet,yby,mva,fao,kgk,min,kac,dji,box,rus,chz,emp,ktm,bps,bon,nus,bss,cut,sue,meq,kpr,rwo,ceb,zaj,mib,aui,apc,kdl,mxb,okv,rai,big,reg,ulk,mlg,yap,tpt,hrv,nak,plu,nde,kyc,arp,hau,ary,alp,apr,caa,mbh,uvl,zat,bjp,urd,bki,lin,mek,hlt,iws,spl,xav,yml,lcm,ese,xho,are,mux,lww,ndg,ntu,tzj,ame,yss,zar,fil,aii,csy,gvs,zpm,amh,spp,ken,avt,ltz,swh,viv,kmk,zul,bqp,cav,wln,leu,tcs,tuf,mkd,clu,msy,too,ast,amx,quf,jav,yre,nhe,tat,lbk,maj,msm,rug,nor,tbc,prf,pad,zlm,kze,wnc,fai,cbs,mai,aoi,mxq,bao,kos,mlh,nep,mkl,roo,umb,poh,bod,nna,aey,afr,aly,cac,maa,aze,fon,tha,mhl,chd,tpi,tzm,acq,kyz,nbq,yle,ape,bco,att,nin,mkj,yuj,ata,djr,atb,enq,cpb,sxb,rmc,zas,guj,kbq,gfk,tgo,acm,cux,fin,npi,etr,tsn,dob,mpt,alq,byx,cak,cso,spy,oci,asm,ttc,nwi,srn,hmn,gyr,hto,ngu,cpa,tif,fuv,kue,yuw,ote,mgw,ssg,bos,mvn,dop,aso,mox,ndj,stp,mpp,nas,kon,mks,caf,mbs,mcd,wap,cco,tod,aon,aom,cnl,srp,zga,lat,sja,kpj,nhi,nko,swp,bho,blw,mih,mon,sna,bgs,als,kyf,kur,bul,uzb,knj,mam,yor,zos,gdr,aka,bam,bmh,gnw,lid,cha,msc,zpl,gun,qxn,zsm,spa,mgh,nca,cpc,quc,hvn,bvr,agu,ngp,aak,jni,mau,sab,wos,huv,swe,kea,tum,pes,som,pbt,mmo,amo,kgp,taq,sbe,mil,nhg,bmu,bvd,wrs,atg,muy,tpa,ign,vmy,uri,chf,cek,knf,pib,soy,boa,ces,xed,pma,hix,kbc,orm,sim,ace,nhw,kud,ppo,xnn,yut,snx,ilo,zaa,nld,bsp,aau,myk,grn,bkq,cme,bbb,ssd,fur,knc,knv,heg,urw,ayr,ons,sat,crx,rop,szl,suz,ncl,anh,kto,tca,chk,xla,qxh,ziw,ntp,azb,ara,tew,sot,cjv,djk,usa,ltg,cap,arz,lmo,vec,jao,wer,dhg,vie,ded,hop,khk,faa,tam,sus,mwc,ikk,kek,mie,trc,tue,ura,crh,bkd,bzj,kwj,klt,sps,jid,xsi,swa,qxo,lim,nqo,hns,tmd,mbt,mbc,ibo,hun,wrk,bnp,abt,kaq,car,kiz,nvm,nfa,gul,guo,uzn,beo,aer,nhy,otm,cjo,tgk,bel,eri,mca,wsk,rro,row,bsn,tpz,fij,tvk,msb,mpx,abx,poy,sgb,kas,tcz,top,dif,awk,cbc,bea,ell,myy,pus,bmr,ssx,pao,ebk,ajp,opm,wnu,gub,acr,tbf,ubr,cth,taj,aby,k
de,mqj,zao,khm,hat,gle,azg,cbv,ian,apu,ptp,kbm,met,plt,sag,agd,pag,ydd,ckb,mzz,div,kmg,miz,tac,tuo,gvn,boj,tee,mph,mna,qwh,gng,agg,mle,rgu,haw,med,kyg,mig,nhu,tnc,waj,kat,lua,zpz,kpx,tof,ven,dzo,yaa,bqc,klv,qul,kqw,bef,gai,heb,nuy,zac,mcr,zpc,ssw,meu,tuk,gui,kmo,usp,otq,khs,ksj,xbi,nya,cya,aoj,kmr,grc,sny,snp,mir,piu,geb,tgl,dik,agn,dan,qvn,kaz,kbp,mto,tiy,xon,zav,dww,zap,kqa,lac,kne,wat,cbt,naf,inb,kwf,crn,azz,wim,ben,wro,poi,yue,awb,cgc,eng,mjc,amf,mps,mwe,ncu,cle,tdt,hne,zai,gdn,toc,bhl,kir,ron,fue,kyq,ixl,ghs,ncj,tbz,nnq,mio,kwd,mxp,beu,sbk,fuh,gym,ztq,mey,ikw,pab,kmb,cof,tso,ipi,byr,aia,wiv,agm,npl,ter,hye,iou,tku,nno,cnt,kqc,sll,lvs,gnn,nob,dah,nii,san,wuv,udu,gux,ots,zpq,cuk,mbj,nab,bjz,hbo,imo,mcf,glk,zam,twi,srd,sin,zca,qvc,agr,con,kjs,zaw,mav,gum,dov,ood,soq,tte,msa,chq,cbk,isn,kpf,ptu,mri,cao,aeb,cni,aaz,yon,pan,sgz,rom,mop,gwi,nou,uig,gla,far,atd,hin,tnp,bbr,kpg,huu,arn,jvn,cat,awa,amm,urb,run,mit,pir,gam,adz,tir,isl,pls,mlt,qve,nyu,txu,tbg,dwy,quy,ruf,kiw,shp,amr,ita,maq,dgr,fra,kin,ubu,gof,gaz,mgc,cmo,ctu,tel,eus,mcq,bpr,ino,snd,bgt,mwf,acu,jic,kkc,jac,lit,xtd,dyu,kvn,zyp,prs,cop,auc,wed,apb,sqi,ban,wal,poe,tnk,myu,otn,kje,ong,bkx,zsr,hch,agt,wiu,spm,zpu,scn,sri,myw,buk,kdc,zho,sbs,slv,deu,kqf,kvg,tgp,bhg,dwr,xtm,amu,wbp,tim,ory,tos,kan,kbh,mya,mwp,mcb,shn,bdd,cub,yrb,tbo,yal,lug,tah,txq,emi,hub,nso,slk,zpo,zpv,bmk,nss,bjn,nch,bzd,shj,ukr,mbl,tlf,kab,kew,kpw,luo,cpy,kmu,kup,zab,pri,snc,wbi,acf,gmv,glg,amp,qup,nop,srq,yka,apw,mqb,wmt,bch,ewe,sey,lbb,epo,qvh,taw,fuc,kql,ksd,smo,gvf,cmn,yad,ind,qvs,obo,wmw,nsn,anv,mic,pap,ake,fas,cbr,bjr,glv,mdy,tsw,gvc,noa,bus,bjv,cwe,pon,pio,snn,mal,nho,bba,jae,mxt,wol,nif,ycn,lao,tfr,ffm,qub,hus,bzh,mlp,mti,not,nys,tzo,arb,mos,kam,cuc,dgc,pah,pjt,est,bxh,hot,bre,kms,cot,awx,bjk,pwg,cpu,hla,mpm,fuf,pol,tnn,shi,auy,mpj,tuc,bug,kor,zad,war,ars,rkb,mni,cbu,lif,mar,dad,mee,dgz,mco,kik,apz,mkn,sco,mbb,maz,lij,khz,hmo,guh,sun,cbi,lgl,nhr,tiw,daa,amn,amk,tke,lex,mag,cym,eko,zia,mcp,gah,urt,sua,cab,quh,srm,vid,blz,mmx,apn,tur,r
my,bem,yaq,ctp,cui,lus,tav,cax,yva | -| MTEB(Indic, beta) | 23 | {'BitextMining': 4, 'Clustering': 1, 'Classification': 13, 'STS': 1, 'PairClassification': 1, 'Retrieval': 2, 'Reranking': 1} | [Web, Fiction, Social, Encyclopaedic, Religious, Written, Constructed, Non-fiction, Legal, News, Spoken, Reviews, Government] | ban,pag,ckb,ydd,srp,azj,jpn,bho,por,sna,als,scn,cjk,zho,mwr,bul,slv,deu,yor,bak,ory,aka,bam,kat,lua,kan,dzo,mya,zsm,spa,shn,min,nus,fao,heb,kac,lug,tuk,kea,rus,ssw,tum,swe,nso,pes,slk,som,mup,pbt,nya,ceb,bjn,kmr,apc,taq,ukr,kab,luo,tgl,dik,dan,kaz,kbp,hrv,ces,glg,ary,hau,ace,urd,ben,boy,ewe,ilo,yue,lin,nld,eng,hne,epo,kir,grn,ron,xho,smo,fur,knc,cmn,ind,ayr,sat,szl,pap,fas,kmb,tso,ltz,swh,brx,zul,azb,doi,ara,hye,mkd,nno,ast,jav,lvs,mal,lao,sot,wol,nob,ltg,tat,san,arz,lmo,vec,nor,vie,sag,khk,arb,mos,kam,tam,bgc,mai,gbm,srd,est,twi,crh,sin,nep,swa,umb,bod,pol,lim,nqo,afr,bug,kor,ibo,mri,hun,aeb,war,ars,mni,fon,tha,mar,tpi,tzm,acq,pan,uzn,kik,gla,uig,hin,lij,tgk,amh,bel,sun,acm,guj,fin,cat,awa,fij,npi,run,tsn,kas,tir,isl,asm,mlt,ell,oci,mag,cym,pus,gom,quy,ajp,raj,fuv,ita,kin,bos,fra,gaz,eus,tel,tur,snd,kon,khm,bem,dyu,gle,hat,lit,prs,lus,plt | -| MTEB(Medical) | 12 | {'Retrieval': 9, 'Clustering': 2, 'Reranking': 1} | [Web, Academic, Medical, Written, Non-fiction, Government] | rus,eng,kor,ara,spa,zho,vie,fra,pol,cmn | -| MTEB(Multilingual, beta) | 132 | {'BitextMining': 13, 'Classification': 43, 'Clustering': 17, 'Retrieval': 18, 'InstructionRetrieval': 3, 'MultilabelClassification': 5, 'PairClassification': 11, 'Reranking': 6, 'STS': 16} | [Web, Fiction, Social, Academic, Religious, Written, Medical, Blog, Constructed, Non-fiction, Legal, Government, News, Reviews, Spoken, Encyclopaedic, Programming, Subtitles] | 
qvm,esk,nlg,toj,gup,llg,jpn,azj,for,lav,kmh,por,bsj,tna,upv,cta,smk,zty,qvz,ntj,ton,uvh,cjk,kgf,gaw,bak,seh,jiv,hui,ksr,uli,kwi,qvw,kkl,arl,msk,omw,aai,tet,yby,mva,fao,kgk,min,kac,dji,mui,box,rus,chz,emp,bew,ktm,bps,bon,nus,bss,cut,sue,meq,kpr,rwo,ceb,zaj,mib,aui,apc,kdl,mxb,okv,rai,big,reg,ulk,mlg,yap,tpt,rej,hrv,nak,plu,nde,lfn,kyc,arp,hau,ary,alp,apr,caa,mbh,uvl,zat,bjp,urd,bki,lin,mek,hlt,iws,spl,xav,yml,lcm,ese,xho,are,mux,lww,ndg,ntu,tzj,ame,yss,zar,fil,aii,csy,gvs,zpm,amh,spp,ken,avt,ltz,swh,viv,kmk,zul,bqp,cav,wln,leu,tcs,tuf,mkd,clu,msy,too,ast,amx,quf,jav,yre,nhe,tat,lbk,maj,msm,rug,nor,tbc,prf,pad,zlm,kze,wnc,fai,cbs,mai,aoi,mxq,bao,kos,mlh,nep,mkl,roo,umb,poh,bod,nna,aey,afr,aly,cac,maa,aze,fon,tha,mhl,chd,tpi,tzm,acq,kyz,nbq,yle,ape,bco,att,nin,mkj,yuj,ata,djr,atb,enq,cpb,sxb,rmc,zas,guj,kbq,gfk,tgo,acm,cux,fin,npi,etr,tsn,dob,mpt,alq,byx,cak,cso,spy,oci,asm,ttc,nwi,srn,hmn,gyr,hto,arq,ngu,cpa,tif,fuv,raj,kue,yuw,ote,mgw,ssg,bos,mvn,dop,aso,mox,ndj,stp,mpp,nas,kon,mks,caf,mbs,mcd,wap,cco,tod,aon,aom,cnl,srp,zga,lat,sja,kpj,nhi,nko,swp,bho,blw,mih,mon,sna,bgs,als,kyf,kur,bul,uzb,knj,mam,yor,zos,gdr,aka,bam,bmh,gnw,lid,cha,msc,zpl,gun,qxn,zsm,spa,mgh,nca,yid,pms,mhr,cpc,quc,hvn,bvr,agu,svk,ngp,aak,jni,mau,sab,wos,huv,swe,kea,tum,pes,som,mup,pbt,mmo,amo,kgp,ido,taq,sbe,mil,nhg,bmu,bvd,wrs,atg,muy,tpa,chv,ign,vmy,cor,uri,fry,chf,cek,knf,pib,soy,boa,ces,xed,pma,hix,kbc,orm,sim,ace,nhw,kud,ppo,xnn,yut,boy,snx,ilo,zaa,nld,bsp,aau,myk,grn,bkq,cme,bbb,ssd,fur,knc,wuu,knv,heg,urw,ayr,ons,sat,crx,ang,hsb,rop,szl,suz,mad,ncl,anh,kto,tca,chk,xla,qxh,brx,ziw,ntp,azb,ara,tew,sot,cjv,djk,usa,ltg,cap,arz,lmo,vec,jao,wer,dhg,vie,ded,hop,khk,faa,tam,bgc,sus,mwc,ikk,kek,mie,trc,tue,ura,crh,ber,bkd,bzj,kwj,klt,sps,jid,xsi,swa,qxo,csb,lim,nqo,hns,tmd,mbt,mbc,ibo,hun,wrk,bnp,abt,kaq,car,kiz,nvm,nfa,gul,guo,uzn,beo,aer,nhy,otm,orv,cjo,tgk,bel,eri,mca,wsk,rro,row,bsn,tpz,fij,tvk,msb,mpx,abx,poy,sgb,kas,tcz,top,dif,awk,cbc,bea,ell,myy,pus,bmr,ssx,pao,ebk,ajp,opm,wnu,gub,acr,max,
tbf,ubr,cth,taj,aby,kde,mqj,zao,tyv,khm,hat,gle,azg,cbv,ian,apu,ptp,kbm,met,plt,sag,agd,sah,pag,ydd,ckb,mzz,div,kmg,miz,tac,tuo,gvn,boj,tee,mph,mna,qwh,gng,agg,mle,mak,rgu,haw,med,kyg,mig,nhu,tnc,waj,kat,lua,zpz,kpx,tof,ven,dzo,yaa,bqc,klv,qul,kqw,bef,gai,heb,nuy,zac,mcr,zpc,ssw,meu,tuk,gui,kmo,usp,otq,khs,ksj,xbi,nya,cya,aoj,kmr,grc,sny,snp,mir,piu,geb,tgl,dik,agn,dan,qvn,kaz,kbp,mto,tiy,xon,zav,dww,zap,kqa,lac,kne,wat,cbt,naf,inb,kwf,crn,azz,wim,ben,wro,poi,yue,awb,cgc,eng,mjc,amf,mps,mwe,ncu,cle,tdt,hne,zai,gdn,toc,bhl,kir,ron,fue,kyq,ixl,ghs,ncj,tbz,nnq,mio,kwd,mxp,beu,sbk,fuh,gym,ztq,mey,ikw,pab,pam,kmb,cof,tso,ipi,byr,aia,wiv,pcm,agm,doi,npl,ter,hye,iou,tku,nno,cnt,kqc,sll,lvs,gnn,nob,dah,nii,san,wuv,udu,gux,ots,zpq,cuk,mbj,nab,bjz,hbo,imo,mcf,glk,zam,twi,srd,sin,zca,qvc,agr,con,kjs,zaw,mav,gum,dov,ood,soq,tte,msa,chq,cbk,tzl,isn,kpf,ptu,mri,cao,aeb,cni,aaz,yon,pan,sgz,rom,mop,gwi,nou,uig,gla,far,atd,hin,tnp,bbr,kpg,huu,arn,jvn,cat,awa,amm,urb,run,mit,pir,gam,adz,tir,isl,pls,mlt,gsw,qve,nyu,txu,tbg,dwy,quy,ruf,kiw,shp,amr,ita,maq,dgr,fra,kin,ubu,gof,gaz,mgc,cmo,ctu,tel,eus,mcq,bpr,ino,snd,bgt,mwf,acu,jic,kkc,jac,lit,xtd,dyu,kvn,zyp,prs,cop,auc,wed,apb,sqi,ban,wal,poe,tnk,myu,otn,kje,ong,bkx,zsr,nds,hch,agt,wiu,spm,zpu,scn,sri,myw,buk,kdc,zho,sbs,slv,mwr,deu,kqf,kvg,tgp,bhg,dwr,xtm,amu,wbp,tim,ory,kzj,tos,kan,kbh,mya,mwp,mcb,shn,bdd,cub,yrb,tbo,yal,nij,lug,tah,txq,emi,hub,nso,slk,zpo,zpv,bmk,nss,bjn,nch,abs,bzd,shj,ukr,mbl,ina,tlf,kab,kew,kpw,luo,cpy,kmu,kup,zab,pri,snc,wbi,acf,gmv,glg,dsb,amp,qup,nop,srq,yka,apw,mqb,wmt,bch,ewe,sey,lbb,epo,qvh,taw,fuc,kql,ksd,smo,gvf,cmn,yad,ind,qvs,obo,wmw,nsn,anv,mic,pap,ake,nov,fas,cbr,bjr,glv,mdy,bbc,tsw,gvc,noa,bus,bjv,cwe,pon,pio,snn,swg,mal,nho,bba,jae,mxt,wol,nif,ycn,lao,tfr,ffm,qub,hus,bzh,mlp,mti,not,nys,ile,tzo,arb,mos,kam,cuc,dgc,pah,pjt,gbm,est,bxh,hot,bre,kms,cot,awx,bjk,pwg,cpu,hla,mpm,fuf,pol,tnn,shi,auy,mpj,tuc,bug,kor,zad,war,ars,rkb,mni,cbu,lif,mar,krc,dad,mee,dgz,mco,kik,apz,mkn,sco,mbb,maz,lij,khz,hmo,guh,
sun,cbi,lgl,nhr,tiw,daa,amn,amk,tke,lex,mag,cym,gom,eko,zia,mcp,gah,urt,sua,cab,quh,srm,dtp,vid,blz,bhp,mmx,apn,tur,rmy,bem,yaq,ctp,cui,lus,tav,cax,yva | -| [MTEB(Retrieval w/Instructions)](https://arxiv.org/abs/2403.15246) | 3 | {'InstructionRetrieval': 3} | [Written, News] | eng | -| [MTEB(Scandinavian)](https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/) | 28 | {'BitextMining': 2, 'Classification': 13, 'Retrieval': 7, 'Clustering': 6} | [Web, Fiction, Social, Written, Blog, Non-fiction, Legal, News, Spoken, Reviews, Government, Encyclopaedic] | swe,nno,isl,dan,fao,nob | -| MTEB(code) | 12 | {'Retrieval': 12} | [Written, Programming] | javascript,ruby,sql,go,c,eng,shell,typescript,rust,java,php,python,scala,swift,c++ | -| [MTEB(deu)](https://arxiv.org/html/2401.02709v1) | 19 | {'Classification': 6, 'Clustering': 4, 'PairClassification': 2, 'Reranking': 1, 'Retrieval': 4, 'STS': 2} | [Web, Written, News, Spoken, Reviews, Encyclopaedic] | pol,deu,fra,eng | -| MTEB(eng, beta) | 41 | {'Classification': 8, 'Retrieval': 10, 'Clustering': 8, 'Reranking': 2, 'STS': 9, 'PairClassification': 3, 'Summarization': 1} | [Web, Academic, Social, Written, Medical, Blog, Non-fiction, News, Spoken, Reviews, Encyclopaedic, Programming] | nld,tur,eng,ara,spa,ita,deu,fra,pol,cmn | -| MTEB(eng, classic) | 67 | {'Classification': 12, 'Retrieval': 26, 'Clustering': 11, 'Reranking': 4, 'STS': 10, 'PairClassification': 3, 'Summarization': 1} | [Web, Academic, Social, Written, Medical, Blog, Non-fiction, News, Spoken, Reviews, Encyclopaedic, Programming] | nld,tur,eng,ara,spa,ita,deu,fra,pol,cmn | -| [MTEB(fra)](https://arxiv.org/abs/2405.20468) | 26 | {'Classification': 6, 'Clustering': 7, 'PairClassification': 2, 'Reranking': 2, 'Retrieval': 5, 'STS': 3, 'Summarization': 1} | [Web, Academic, Social, Written, Non-fiction, Legal, News, Spoken, Reviews, Encyclopaedic] | pol,deu,fra,eng | -| [MTEB(jpn)](https://github.com/sbintuitions/JMTEB) | 16 | {'Clustering': 2, 
'Classification': 4, 'STS': 2, 'PairClassification': 1, 'Retrieval': 6, 'Reranking': 1} | [Web, Academic, Written, Non-fiction, News, Spoken, Reviews, Encyclopaedic] | jpn | -| MTEB(kor) | 6 | {'Classification': 1, 'Reranking': 1, 'Retrieval': 2, 'STS': 2} | [Web, Written, News, Spoken, Reviews, Encyclopaedic] | kor | -| [MTEB(law)](https://aclanthology.org/2023.eacl-main.148/) | 8 | {'Retrieval': 8} | [Written, Legal] | deu,zho,eng | -| [MTEB(pol)](https://arxiv.org/abs/2405.10138) | 18 | {'Classification': 7, 'Clustering': 3, 'PairClassification': 4, 'STS': 4} | [Web, Fiction, Academic, Social, Written, Non-fiction, Legal, News, Spoken] | pol,deu,fra,eng | -| [MTEB(rus)](https://aclanthology.org/2023.eacl-main.148/) | 23 | {'Classification': 9, 'Clustering': 3, 'MultilabelClassification': 2, 'PairClassification': 1, 'Reranking': 2, 'Retrieval': 3, 'STS': 3} | [Web, Social, Academic, Written, Blog, News, Spoken, Reviews, Encyclopaedic] | rus | -| [NanoBEIR](https://huggingface.co/collections/zeta-alpha-ai/nanobeir-66e1a0af21dfd93e620cd9f6) | 13 | {'Retrieval': 13} | [Web, Academic, Social, Medical, Written, Non-fiction, News, Encyclopaedic] | eng | -| [RAR-b](https://arxiv.org/abs/2404.06347) | 17 | {'Retrieval': 17} | [Encyclopaedic, Written, Programming] | eng | + +| Name | Leaderboard name | # Tasks | Task Types | Domains | Languages | +|------|------------------|---------|------------|---------|-----------| +| [BEIR](https://arxiv.org/abs/2104.08663) | BEIR | 15 | Retrieval: 15 | [Academic, Blog, Encyclopaedic, Financial, Government, Medical, News, Non-fiction, Programming, Reviews, Social, Web, Written] | eng | +| [BEIR-NL](https://arxiv.org/abs/2412.08329) | BEIR-NL | 15 | Retrieval: 15 | [Academic, Encyclopaedic, Medical, Non-fiction, Web, Written] | nld | +| [BRIGHT](https://brightbenchmark.github.io/) | BRIGHT | 1 | Retrieval: 1 | [Non-fiction, Written] | eng | +| [BRIGHT (long)](https://brightbenchmark.github.io/) | BRIGHT (long) | 1 | Retrieval: 1 | 
[Non-fiction, Written] | eng | +| [BuiltBench(eng)](https://arxiv.org/abs/2411.12056) | BuiltBench(eng) | 4 | Clustering: 2, Retrieval: 1, Reranking: 1 | [Engineering, Written] | eng | +| [ChemTEB](https://arxiv.org/abs/2412.00532) | Chemical | 27 | BitextMining: 1, Classification: 17, Clustering: 2, PairClassification: 5, Retrieval: 2 | [Chemistry] | ces,deu,eng,fra,hin,jpn,kor,msa,nld,por,spa,tur,zho | +| [CoIR](https://github.com/CoIR-team/coir) | Code Information Retrieval | 10 | Retrieval: 10 | [Programming, Written] | c++,eng,go,java,javascript,php,python,ruby,sql | +| [CodeRAG](https://arxiv.org/abs/2406.14497) | CodeRAG | 4 | Reranking: 4 | [Programming] | python | +| [Encodechka](https://github.com/avidale/encodechka) | Encodechka | 7 | STS: 2, Classification: 4, PairClassification: 1 | [Fiction, Government, News, Non-fiction, Social, Web, Written] | rus | +| [FollowIR](https://arxiv.org/abs/2403.15246) | Instruction Following | 3 | InstructionRetrieval: 3 | [News, Written] | eng | +| [LongEmbed](https://arxiv.org/abs/2404.12096v2) | Long-context Retrieval | 6 | Retrieval: 6 | [Academic, Blog, Encyclopaedic, Fiction, Non-fiction, Spoken, Written] | eng | +| [MIEB(Img)](https://arxiv.org/abs/2504.10471) | Image only | 49 | Any2AnyRetrieval: 15, ImageClassification: 22, ImageClustering: 5, VisualSTS(eng): 5, VisualSTS(multi): 2 | [Blog, Encyclopaedic, Medical, News, Non-fiction, Reviews, Scene, Social, Spoken, Web, Written] | ara,cmn,deu,eng,fra,ita,kor,nld,pol,por,rus,spa,tur | +| MIEB(Multilingual) | MIEB(Multilingual) | 130 | ImageClassification: 22, ImageClustering: 5, ZeroShotClassification: 23, VisionCentricQA: 6, Compositionality: 7, VisualSTS(eng): 7, Any2AnyRetrieval: 45, DocumentUnderstanding: 10, Any2AnyMultilingualRetrieval: 3, VisualSTS(multi): 2 | [Academic, Blog, Constructed, Encyclopaedic, Medical, News, Non-fiction, Reviews, Scene, Social, Spoken, Web, Written] | 
ara,ben,bul,ces,cmn,dan,deu,ell,eng,est,fas,fil,fin,fra,heb,hin,hrv,hun,ind,ita,jpn,kor,mri,nld,nor,pol,por,quz,ron,rus,spa,swa,swe,tel,tha,tur,ukr,vie,zho | +| MIEB(eng) | MIEB(eng) | 125 | ImageClassification: 22, ImageClustering: 5, ZeroShotClassification: 23, VisionCentricQA: 6, Compositionality: 7, VisualSTS(eng): 7, Any2AnyRetrieval: 45, DocumentUnderstanding: 10 | [Academic, Blog, Constructed, Encyclopaedic, Medical, News, Non-fiction, Reviews, Scene, Social, Spoken, Web, Written] | eng | +| MIEB(lite) | MIEB(lite) | 51 | ImageClassification: 8, ImageClustering: 2, ZeroShotClassification: 7, VisionCentricQA: 5, Compositionality: 6, VisualSTS(eng): 2, VisualSTS(multi): 2, Any2AnyRetrieval: 11, DocumentUnderstanding: 6, Any2AnyMultilingualRetrieval: 2 | [Academic, Blog, Encyclopaedic, Medical, News, Non-fiction, Reviews, Scene, Social, Spoken, Web, Written] | ara,ben,bul,ces,cmn,dan,deu,ell,eng,est,fas,fil,fin,fra,heb,hin,hrv,hun,ind,ita,jpn,kor,mri,nld,nor,pol,por,quz,ron,rus,spa,swa,swe,tel,tha,tur,ukr,vie,zho | +| [MINERSBitextMining](https://arxiv.org/pdf/2406.07424) | MINERSBitextMining | 7 | BitextMining: 7 | [Reviews, Social, Written] | abs,ace,afr,amh,ang,ara,arq,arz,ast,awa,aze,ban,bbc,bel,ben,ber,bew,bhp,bjn,bos,bre,bug,bul,cat,cbk,ceb,ces,cha,cmn,cor,csb,cym,dan,deu,dsb,dtp,ell,eng,epo,est,eus,fao,fin,fra,fry,gla,gle,glg,gsw,hau,heb,hin,hrv,hsb,hun,hye,ibo,ido,ile,ina,ind,isl,ita,jav,jpn,kab,kat,kaz,khm,kor,kur,kzj,lat,lfn,lit,lvs,mad,mak,mal,mar,max,mhr,min,mkd,mon,mui,nds,nij,nld,nno,nob,nov,oci,orv,pam,pcm,pes,pms,pol,por,rej,ron,rus,slk,slv,spa,sqi,srp,sun,swe,swg,swh,tam,tat,tel,tgl,tha,tuk,tur,tzl,uig,ukr,urd,uzb,vie,war,wuu,xho,yid,yor,yue,zsm | +| MTEB(Code, v1) | Code | 12 | Retrieval: 12 | [Programming, Written] | c,c++,eng,go,java,javascript,php,python,ruby,rust,scala,shell,sql,swift,typescript | +| [MTEB(Europe, v1)](https://arxiv.org/abs/2502.13595) | European | 74 | BitextMining: 7, Classification: 21, Clustering: 8, Retrieval: 15, 
InstructionRetrieval: 3, MultilabelClassification: 2, PairClassification: 6, Reranking: 3, STS: 9 | [Academic, Blog, Constructed, Encyclopaedic, Fiction, Financial, Government, Legal, Medical, News, Non-fiction, Programming, Religious, Reviews, Social, Spoken, Subtitles, Web, Written] | bul,ces,dan,deu,ell,eng,est,eus,fao,fin,fra,gle,hrv,hun,isl,ita,lav,lit,mlt,nld,nno,nob,pol,por,rom,ron,slk,slv,spa,swe | +| [MTEB(Indic, v1)](https://arxiv.org/abs/2502.13595) | Indic | 23 | BitextMining: 4, Clustering: 1, Classification: 13, PairClassification: 1, Retrieval: 2, Reranking: 1, STS: 1 | [Constructed, Encyclopaedic, Fiction, Government, Legal, News, Non-fiction, Religious, Reviews, Social, Spoken, Web, Written] | asm,awa,ben,bgc,bho,bod,boy,brx,doi,eng,gbm,gom,guj,hin,hne,kan,kas,mai,mal,mar,mni,mup,mwr,nep,npi,ory,pan,pus,raj,san,sat,snd,tam,tel,urd | +| MTEB(Law, v1) | Legal | 8 | Retrieval: 8 | [Legal, Written] | deu,eng,zho | +| MTEB(Medical, v1) | Medical | 12 | Retrieval: 9, Clustering: 2, Reranking: 1 | [Academic, Government, Medical, Non-fiction, Web, Written] | ara,cmn,eng,fra,kor,pol,rus,spa,vie,zho | +| [MTEB(Multilingual, v2)](https://arxiv.org/abs/2502.13595) | Multilingual | 131 | BitextMining: 13, Classification: 43, Clustering: 16, Retrieval: 18, InstructionRetrieval: 3, MultilabelClassification: 5, PairClassification: 11, Reranking: 6, STS: 16 | [Academic, Blog, Constructed, Encyclopaedic, Entertainment, Fiction, Financial, Government, Legal, Medical, News, Non-fiction, Programming, Religious, Reviews, Social, Spoken, Subtitles, Web, Written] | 
aai,aak,aau,aaz,abs,abt,abx,aby,ace,acf,acm,acq,acr,acu,adz,aeb,aer,aey,afr,agd,agg,agm,agn,agr,agt,agu,aia,aii,ajp,aka,ake,alp,alq,als,aly,ame,amf,amh,amk,amm,amn,amo,amp,amr,amu,amx,ang,anh,anv,aoi,aoj,aom,aon,apb,apc,ape,apn,apr,apu,apw,apz,ara,arb,are,arl,arn,arp,arq,ars,ary,arz,asm,aso,ast,ata,atb,atd,atg,att,auc,aui,auy,avt,awa,awb,awk,awx,ayr,azb,aze,azg,azj,azz,bak,bam,ban,bao,bba,bbb,bbc,bbr,bch,bco,bdd,bea,bef,bel,bem,ben,beo,ber,beu,bew,bgc,bgs,bgt,bhg,bhl,bho,bhp,big,bjk,bjn,bjp,bjr,bjv,bjz,bkd,bki,bkq,bkx,blw,blz,bmh,bmk,bmr,bmu,bnp,boa,bod,boj,bon,bos,box,boy,bpr,bps,bqc,bqp,bre,brx,bsj,bsn,bsp,bss,bug,buk,bul,bus,bvd,bvr,bxh,byr,byx,bzd,bzh,bzj,caa,cab,cac,caf,cak,cao,cap,car,cat,cav,cax,cbc,cbi,cbk,cbr,cbs,cbt,cbu,cbv,cco,ceb,cek,ces,cgc,cha,chd,chf,chk,chq,chv,chz,cjk,cjo,cjv,ckb,cle,clu,cme,cmn,cmo,cni,cnl,cnt,cof,con,cop,cor,cot,cpa,cpb,cpc,cpu,cpy,crh,crn,crx,csb,cso,csy,cta,cth,ctp,ctu,cub,cuc,cui,cuk,cut,cux,cwe,cya,cym,daa,dad,dah,dan,ded,deu,dgc,dgr,dgz,dhg,dif,dik,div,dji,djk,djr,dob,doi,dop,dov,dsb,dtp,dwr,dww,dwy,dyu,dzo,ebk,eko,ell,emi,emp,eng,enq,epo,eri,ese,esk,est,etr,eus,ewe,faa,fai,fao,far,fas,ffm,fij,fil,fin,fon,for,fra,fry,fuc,fue,fuf,fuh,fur,fuv,gah,gai,gam,gaw,gaz,gbm,gdn,gdr,geb,gfk,ghs,gla,gle,glg,glk,glv,gmv,gng,gnn,gnw,gof,gom,grc,grn,gsw,gub,guh,gui,guj,gul,gum,gun,guo,gup,gux,gvc,gvf,gvn,gvs,gwi,gym,gyr,hat,hau,haw,hbo,hch,heb,heg,hin,hix,hla,hlt,hmn,hmo,hne,hns,hop,hot,hrv,hsb,hto,hub,hui,hun,hus,huu,huv,hvn,hye,ian,ibo,ido,ign,ikk,ikw,ile,ilo,imo,ina,inb,ind,ino,iou,ipi,isl,isn,ita,iws,ixl,jac,jae,jao,jav,jic,jid,jiv,jni,jpn,jvn,kab,kac,kam,kan,kaq,kas,kat,kaz,kbc,kbh,kbm,kbp,kbq,kdc,kde,kdl,kea,kek,ken,kew,kgf,kgk,kgp,khk,khm,khs,khz,kik,kin,kir,kiw,kiz,kje,kjs,kkc,kkl,klt,klv,kmb,kmg,kmh,kmk,kmo,kmr,kms,kmu,knc,kne,knf,knj,knv,kon,kor,kos,kpf,kpg,kpj,kpr,kpw,kpx,kqa,kqc,kqf,kql,kqw,krc,ksd,ksj,ksr,ktm,kto,kud,kue,kup,kur,kvg,kvn,kwd,kwf,kwi,kwj,kyc,kyf,kyg,kyq,kyz,kze,kzj,lac,lao,lat,lav,lbb,lbk,lcm,leu,lex,lfn,lgl,lid,
lif,lij,lim,lin,lit,llg,lmo,ltg,ltz,lua,lug,luo,lus,lvs,lww,maa,mad,mag,mai,maj,mak,mal,mam,maq,mar,mau,mav,max,maz,mbb,mbc,mbh,mbj,mbl,mbs,mbt,mca,mcb,mcd,mcf,mco,mcp,mcq,mcr,mdy,med,mee,mek,meq,met,meu,mey,mgc,mgh,mgw,mhl,mhr,mib,mic,mie,mig,mih,mil,min,mio,mir,mit,miz,mjc,mkd,mkj,mkl,mkn,mks,mle,mlg,mlh,mlp,mlt,mmo,mmx,mna,mni,mon,mop,mos,mox,mph,mpj,mpm,mpp,mps,mpt,mpx,mqb,mqj,mri,msa,msb,msc,msk,msm,msy,mti,mto,mui,mup,mux,muy,mva,mvn,mwc,mwe,mwf,mwp,mwr,mxb,mxp,mxq,mxt,mya,myk,myu,myw,myy,mzz,nab,naf,nak,nas,nbq,nca,nch,ncj,ncl,ncu,nde,ndg,ndj,nds,nep,nfa,ngp,ngu,nhe,nhg,nhi,nho,nhr,nhu,nhw,nhy,nif,nii,nij,nin,nko,nld,nlg,nna,nno,nnq,noa,nob,nop,nor,not,nou,nov,npi,npl,nqo,nsn,nso,nss,ntj,ntp,ntu,nus,nuy,nvm,nwi,nya,nys,nyu,obo,oci,okv,omw,ong,ons,ood,opm,orm,orv,ory,ote,otm,otn,otq,ots,pab,pad,pag,pah,pam,pan,pao,pap,pbt,pcm,pes,pib,pio,pir,piu,pjt,pls,plt,plu,pma,pms,poe,poh,poi,pol,pon,por,poy,ppo,prf,pri,prs,ptp,ptu,pus,pwg,qub,quc,quf,quh,qul,qup,quy,qvc,qve,qvh,qvm,qvn,qvs,qvw,qvz,qwh,qxh,qxn,qxo,rai,raj,reg,rej,rgu,rkb,rmc,rmy,rom,ron,roo,rop,row,rro,ruf,rug,run,rus,rwo,sab,sag,sah,san,sat,sbe,sbk,sbs,scn,sco,seh,sey,sgb,sgz,shi,shj,shn,shp,sim,sin,sja,slk,sll,slv,smk,smo,sna,snc,snd,snn,snp,snx,sny,som,soq,sot,soy,spa,spl,spm,spp,sps,spy,sqi,srd,sri,srm,srn,srp,srq,ssd,ssg,ssw,ssx,stp,sua,sue,sun,sus,suz,svk,swa,swe,swg,swh,swp,sxb,szl,tac,tah,taj,tam,taq,tat,tav,taw,tbc,tbf,tbg,tbo,tbz,tca,tcs,tcz,tdt,tee,tel,ter,tet,tew,tfr,tgk,tgl,tgo,tgp,tha,tif,tim,tir,tiw,tiy,tke,tku,tlf,tmd,tna,tnc,tnk,tnn,tnp,toc,tod,tof,toj,ton,too,top,tos,tpa,tpi,tpt,tpz,trc,tsn,tso,tsw,ttc,tte,tuc,tue,tuf,tuk,tum,tuo,tur,tvk,twi,txq,txu,tyv,tzj,tzl,tzm,tzo,ubr,ubu,udu,uig,ukr,uli,ulk,umb,upv,ura,urb,urd,uri,urt,urw,usa,usp,uvh,uvl,uzb,uzn,vec,ven,vid,vie,viv,vmy,waj,wal,wap,war,wat,wbi,wbp,wed,wer,wim,wiu,wiv,wln,wmt,wmw,wnc,wnu,wol,wos,wrk,wro,wrs,wsk,wuu,wuv,xav,xbi,xed,xho,xla,xnn,xon,xsi,xtd,xtm,yaa,yad,yal,yap,yaq,yby,ycn,ydd,yid,yka,yle,yml,yon,yor,yrb,yre,yss,yue,yuj,
yut,yuw,yva,zaa,zab,zac,zad,zai,zaj,zam,zao,zap,zar,zas,zat,zav,zaw,zca,zga,zho,zia,ziw,zlm,zos,zpc,zpl,zpm,zpo,zpq,zpu,zpv,zpz,zsm,zsr,ztq,zty,zul,zyp | +| [MTEB(Scandinavian, v1)](https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/) | Scandinavian | 28 | BitextMining: 2, Classification: 13, Retrieval: 7, Clustering: 6 | [Blog, Encyclopaedic, Fiction, Government, Legal, News, Non-fiction, Reviews, Social, Spoken, Web, Written] | dan,fao,isl,nno,nob,swe | +| [MTEB(cmn, v1)](https://github.com/FlagOpen/FlagEmbedding/tree/master/research/C_MTEB) | Chinese | 32 | Retrieval: 8, Reranking: 4, PairClassification: 2, Clustering: 4, STS: 7, Classification: 7 | [Academic, Entertainment, Financial, Government, Medical, Non-fiction, Written] | cmn | +| [MTEB(deu, v1)](https://arxiv.org/html/2401.02709v1) | German | 19 | Classification: 6, Clustering: 4, PairClassification: 2, Reranking: 1, Retrieval: 4, STS: 2 | [Encyclopaedic, Legal, News, Non-fiction, Reviews, Spoken, Web, Written] | deu | +| MTEB(eng, v1) | English Legacy | 56 | Classification: 12, Retrieval: 15, Clustering: 11, Reranking: 4, STS: 10, PairClassification: 3, Summarization: 1 | [Academic, Blog, Encyclopaedic, Financial, Government, Medical, News, Non-fiction, Programming, Reviews, Social, Spoken, Web, Written] | eng | +| MTEB(eng, v2) | English | 41 | Retrieval: 10, Clustering: 8, Reranking: 2, STS: 9, Classification: 8, PairClassification: 3, Summarization: 1 | [Academic, Blog, Encyclopaedic, Financial, Medical, News, Non-fiction, Programming, Reviews, Social, Spoken, Web, Written] | eng | +| MTEB(fas, beta) | Farsi (BETA) | 60 | Classification: 18, Clustering: 5, PairClassification: 8, Reranking: 2, Retrieval: 21, STS: 3, BitextMining: 3 | [Academic, Blog, Encyclopaedic, Medical, News, Religious, Reviews, Social, Spoken, Web, Written] | fas | +| [MTEB(fra, v1)](https://arxiv.org/abs/2405.20468) | French | 25 | Classification: 6, Clustering: 7, PairClassification: 1, Reranking: 2, 
Retrieval: 5, STS: 3, Summarization: 1 | [Academic, Encyclopaedic, Legal, News, Non-fiction, Reviews, Social, Spoken, Web, Written] | eng,fra | +| [MTEB(jpn, v1)](https://github.com/sbintuitions/JMTEB) | Japanese | 16 | Clustering: 2, Classification: 4, STS: 2, PairClassification: 1, Retrieval: 6, Reranking: 1 | [Academic, Encyclopaedic, News, Non-fiction, Reviews, Spoken, Web, Written] | jpn | +| MTEB(kor, v1) | Korean | 6 | Classification: 1, Reranking: 1, Retrieval: 2, STS: 2 | [Encyclopaedic, News, Reviews, Spoken, Web, Written] | kor | +| [MTEB(pol, v1)](https://arxiv.org/abs/2405.10138) | Polish | 17 | Classification: 7, Clustering: 3, PairClassification: 4, STS: 3 | [Academic, Fiction, Legal, News, Non-fiction, Reviews, Social, Spoken, Web, Written] | pol | +| [MTEB(rus, v1)](https://aclanthology.org/2023.eacl-main.148/) | Russian | 23 | Classification: 9, Clustering: 3, MultilabelClassification: 2, PairClassification: 1, Reranking: 2, Retrieval: 3, STS: 3 | [Academic, Blog, Encyclopaedic, News, Reviews, Social, Spoken, Web, Written] | rus | +| [NanoBEIR](https://huggingface.co/collections/zeta-alpha-ai/nanobeir-66e1a0af21dfd93e620cd9f6) | NanoBEIR | 13 | Retrieval: 13 | [Academic, Encyclopaedic, Medical, News, Non-fiction, Social, Web, Written] | eng | +| [RAR-b](https://arxiv.org/abs/2404.06347) | Reasoning retrieval | 17 | Retrieval: 17 | [Encyclopaedic, Programming, Written] | eng | + diff --git a/docs/create_benchmarks_table.py b/docs/create_benchmarks_table.py index 7fddf07c75..f22d2638bc 100644 --- a/docs/create_benchmarks_table.py +++ b/docs/create_benchmarks_table.py @@ -21,18 +21,26 @@ def benchmark_to_markdown_row(b: mteb.Benchmark) -> str: if t.metadata.languages: agg_langs.update(t.languages) + # to not infinitely trigger ci + agg_domains = sorted(agg_domains) + agg_langs = sorted(agg_langs) langs = ",".join(list(agg_langs)) domains = "[" + ", ".join(agg_domains) + "]" if agg_domains else "" - task_types = dict(Counter([t.metadata.type for t in 
b.tasks])) + task_types = ", ".join( + [ + f"{name}: {val}" + for name, val in Counter([t.metadata.type for t in b.tasks]).items() + ] + ) - return f"| {name_w_reference} | {n_tasks} | {task_types} | {domains} | {langs} |" + return f"| {name_w_reference} | {b.display_name if b.display_name else b.name} | {n_tasks} | {task_types} | {domains} | {langs} |" def create_benchmarks_table(benchmarks: list[mteb.Benchmark]) -> str: table = """ -| Name | # Tasks | Task Types | Domains | Languages | -|------|---------|------------|---------|-----------| +| Name | Leaderboard name | # Tasks | Task Types | Domains | Languages | +|------|------------------|---------|------------|---------|-----------| """ for benchmark in benchmarks: table += benchmark_to_markdown_row(benchmark) + "\n" diff --git a/docs/create_tasks_table.py b/docs/create_tasks_table.py index f7e79331ec..ac16f0313d 100644 --- a/docs/create_tasks_table.py +++ b/docs/create_tasks_table.py @@ -7,8 +7,8 @@ import polars as pl import mteb -from mteb.abstasks.TaskMetadata import PROGRAMMING_LANGS, TASK_TYPE -from mteb.languages import ISO_TO_FAM_LEVEL0, ISO_TO_LANGUAGE +from mteb.abstasks.TaskMetadata import TASK_TYPE +from mteb.languages import ISO_TO_FAM_LEVEL0, ISO_TO_LANGUAGE, PROGRAMMING_LANGS def author_from_bibtex(bibtex: str | None) -> str: diff --git a/docs/images/visualizations/result_objects.png b/docs/images/visualizations/result_objects.png new file mode 100644 index 0000000000..8eba2a7442 Binary files /dev/null and b/docs/images/visualizations/result_objects.png differ diff --git a/docs/mieb/readme.md b/docs/mieb/readme.md index 9aa6f6ce04..0a9c4eb53a 100644 --- a/docs/mieb/readme.md +++ b/docs/mieb/readme.md @@ -3,11 +3,66 @@ # Welcome to MIEB! 👋 -The Massive Image Embedding Benchmark (MIEB) is an image extension of [MTEB](https://arxiv.org/abs/2210.07316) to cover embedding tasks for image-text tasks. 
+The [Massive Image Embedding Benchmark (MIEB)](https://arxiv.org/abs/2504.10471) is an image extension of [MTEB](https://arxiv.org/abs/2210.07316) to cover embedding tasks for image-text tasks. ## 🌱 Background -MIEB intends to extend MTEB and MMTEB to cover image representation learning and image-text alignment tasks. +MIEB intends to extend MTEB and MMTEB to cover image representation learning and image-text alignment tasks. At the time of publishing, MIEB offers 130 tasks over 8 task categories. 3 benchmarks are offered: +1. `MIEB(Multilingual)` +2. `MIEB(eng)` +3. `MIEB(lite)` + +## 🚀 Running MIEB + +If you’re already familiar with how MTEB works, then run any benchmark, task, and model the same way! + + +### Run MIEB in 2 lines via CLI +First, install the `mieb` dependencies: +```sh +pip install mteb[image] +``` + +Then, run the multilingual benchmark with a selected model, e.g. CLIP: +```sh +mteb run -b 'MIEB(Multilingual)' -m openai/clip-vit-base-patch16 +``` + +### Run MIEB in Python + +Similarly, running the benchmark can be done in Python in 3 main steps: 1) select the tasks, 2) load the model, and 3) run the evaluation. + +1. Select the whole benchmark: +```python +import mteb + +tasks = mteb.get_benchmark("MIEB(Multilingual)") +``` + +Alternatively, select a single task: +```python +tasks = mteb.get_tasks(tasks=["CIFAR10ZeroShot"]) +``` + +Or select tasks by categories: +```python +tasks = mteb.get_tasks(task_types=["Compositionality"]) +``` + +2. Load a Model: + +```python +model_name = "laion/CLIP-ViT-L-14-laion2B-s32B-b82K" +model = mteb.get_model(model_name=model_name) +``` + +3. Run the Evaluation: + +```python +evaluation = mteb.MTEB(tasks=tasks) +results = evaluation.run(model) +``` + ## 🪴 Contributing to MIEB @@ -21,13 +76,16 @@ There are a few ways for anyone to contribute to MIEB: Let's go through an example. -## Example +
+ Contribution Example (click to unfold) + +### Example Here is an example implementing a zero-shot image classification from scratch. Let's say we wish to implement CIFAR10 as a task and evaluate an OpenCLIP model on it. To solve this task, we need to encode the `images`, encode the `class label candidates with prompts` (e.g. "this is a dog pic", "this is a cat pic"), and compare them by calculating similarity, and then argmax out the class prediction for each image. We begin by implementing a model wrapper. -### Model Wrapper +#### Model Wrapper See the [`ImageEncoder` class](https://github.com/embeddings-benchmark/mteb/blob/mieb/mteb/encoder_interface.py) for more details. The model class implements `get_text_embeddings`, `get_image_embeddings`, and `calculate_probs` methods. As an example, [`OpenCLIPWrapper`](https://github.com/embeddings-benchmark/mteb/blob/mieb/mteb/models/openclip_models.py) is first implemented, with metadata defined below. ```python @@ -36,7 +94,7 @@ class OpenCLIPWrapper: ``` See also [adding a model](adding_a_model.md) for reference. -### X Evaluator +#### X Evaluator With the model, [ZeroShotClassificationEvaluator](https://github.com/embeddings-benchmark/mteb/blob/mieb/mteb/evaluation/evaluators/Image/ZeroShotClassificationEvaluator.py) is implemented here. This defines how the model are used to do zero-shot classification and get back results on desired metrics. ```python class ZeroShotClassificationEvaluator(Evaluator): @@ -47,7 +105,7 @@ class ZeroShotClassificationEvaluator(Evaluator): ... ``` -### AbsTask X +#### AbsTask X With the evaluator, [AbsTaskZeroShotClassification](https://github.com/embeddings-benchmark/mteb/blob/mieb/mteb/abstasks/Image/AbsTaskZeroShotClassification.py) is defined, operating on the dataset, calling the defined Evaluator, and gives out results. 
```python class AbsTaskZeroShotClassification(AbsTask): @@ -55,7 +113,7 @@ class AbsTaskZeroShotClassification(AbsTask): ``` -### Dataset class +#### Dataset class With all these, we can then define the dataset. [CIFAR10](https://github.com/embeddings-benchmark/mteb/blob/mieb/mteb/tasks/Image/ZeroShotClassification/eng/CIFAR.py) is implemented like this, subclassing `AbsTaskZeroShotClassification`, and overwrite the `get_candidate_labels` function, which gives `["a photo of {label_name}"]` to be used in the evaluator. ```python class CIFAR10ZeroShotClassification(AbsTaskZeroShotClassification): @@ -66,7 +124,7 @@ class CIFAR10ZeroShotClassification(AbsTaskZeroShotClassification): ``` See also [adding a dataset](adding_a_dataset.md) for reference. -### Putting them all together +#### Putting them all together With all these, we can then ```python import mteb @@ -80,3 +138,21 @@ results = evaluation.run(model) ``` By default, results will be under `results/laion__CLIP-ViT-L-14-laion2B-s32B-b82K/REVISION/CIFAR10ZeroShot.json`. Sometimes metrics can be a bit different than what the original paper claimed. This might be due to the resolution/layout difference of images in the remake of the dataset. + +
+ +## Citing + +When using `mieb`, we recommend you use the following citation: + +```bibtex +@article{xiao2025mieb, + author = {Chenghao Xiao and Isaac Chung and Imene Kerboua and Jamie Stirling and Xin Zhang and Márton Kardos and Roman Solomatin and Noura Al Moubayed and Kenneth Enevoldsen and Niklas Muennighoff}, + title = {MIEB: Massive Image Embedding Benchmark}, + publisher = {arXiv}, + journal={arXiv preprint arXiv:2504.10471}, + year = {2025}, + url = {https://arxiv.org/abs/2504.10471}, + doi = {10.48550/ARXIV.2504.10471}, +} +``` diff --git a/docs/tasks.md b/docs/tasks.md index 5e31fbe2b2..b58d7fa455 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -8,25 +8,25 @@ The following tables give you an overview of the tasks in MTEB. | Name | Languages | Type | Category | Domains | # Samples | Dataset statistics | |------|-----------|------|----------|---------|-----------|--------------------| -| [AFQMC](https://aclanthology.org/2021.emnlp-main.357) | ['cmn'] | STS | s2s | | None | None | -| [AILACasedocs](https://zenodo.org/records/4063986) | ['eng'] | Retrieval | p2p | [Legal, Written] | None | None | -| [AILAStatutes](https://zenodo.org/records/4063986) | ['eng'] | Retrieval | p2p | [Legal, Written] | None | None | +| [AFQMC](https://aclanthology.org/2021.emnlp-main.357) (Raghu et al., 2021) | ['cmn'] | STS | s2s | | None | None | +| [AILACasedocs](https://zenodo.org/records/4063986) (Paheli Bhattacharya and Kripabandhu Ghosh and Saptarshi Ghosh and Arindam Pal and Parth Mehta and Arnab Bhattacharya and Prasenjit Majumder, 2020) | ['eng'] | Retrieval | p2p | [Legal, Written] | None | None | +| [AILAStatutes](https://zenodo.org/records/4063986) (Paheli Bhattacharya and Kripabandhu Ghosh and Saptarshi Ghosh and Arindam Pal and Parth Mehta and Arnab Bhattacharya and Prasenjit Majumder, 2020) | ['eng'] | Retrieval | p2p | [Legal, Written] | None | None | | [AJGT](https://link.springer.com/chapter/10.1007/978-3-319-60042-0_66/) (Alomari et al., 2017) | ['ara'] | 
Classification | s2s | [Social, Written] | None | None | -| [ARCChallenge](https://allenai.org/data/arc) (Xiao et al., 2024) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | +| [ARCChallenge](https://allenai.org/data/arc) (Clark et al., 2018) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | | [AROCocoOrder](https://openreview.net/forum?id=KRLUvxh8uaX) (Yuksekgonul et al., 2023) | ['eng'] | Compositionality | i2t | [Encyclopaedic] | {'test': 25010} | {'test': {'num_samples': 25010, 'num_images': 25010, 'num_texts': 125050, 'num_unique_texts': 119661, 'min_text_length': 26, 'average_text_length': 51.52, 'max_text_length': 191}} | | [AROFlickrOrder](https://openreview.net/forum?id=KRLUvxh8uaX) (Yuksekgonul et al., 2023) | ['eng'] | Compositionality | i2t | [Encyclopaedic] | {'test': 5000} | {'test': {'num_samples': 5000, 'num_images': 5000, 'num_texts': 25000, 'num_unique_texts': 23892, 'min_text_length': 11, 'average_text_length': 62.37, 'max_text_length': 185}} | | [AROVisualAttribution](https://openreview.net/forum?id=KRLUvxh8uaX) (Yuksekgonul et al., 2023) | ['eng'] | Compositionality | i2t | [Encyclopaedic] | {'test': 28748} | {'test': {'num_samples': 28748, 'num_images': 28748, 'num_texts': 57496, 'num_unique_texts': 52146, 'min_text_length': 27, 'average_text_length': 35.98, 'max_text_length': 61}} | | [AROVisualRelation](https://openreview.net/forum?id=KRLUvxh8uaX) (Yuksekgonul et al., 2023) | ['eng'] | Compositionality | i2t | [Encyclopaedic] | {'test': 23937} | {'test': {'num_samples': 23937, 'num_images': 23937, 'num_texts': 47874, 'num_unique_texts': 26706, 'min_text_length': 21, 'average_text_length': 34.83, 'max_text_length': 57}} | -| [ATEC](https://aclanthology.org/2021.emnlp-main.357) | ['cmn'] | STS | s2s | | None | None | -| [AfriSentiClassification](https://arxiv.org/abs/2302.08956) | ['amh', 'arq', 'ary', 'hau', 'ibo', 'kin', 'pcm', 'por', 'swa', 'tso', 'twi', 'yor'] | Classification | s2s | [Social, 
Written] | None | None | +| [ATEC](https://aclanthology.org/2021.emnlp-main.357) (Raghu et al., 2021) | ['cmn'] | STS | s2s | | None | None | +| [AfriSentiClassification](https://arxiv.org/abs/2302.08956) (Shamsuddeen Hassan Muhammad, 2023) | ['amh', 'arq', 'ary', 'hau', 'ibo', 'kin', 'pcm', 'por', 'swa', 'tso', 'twi', 'yor'] | Classification | s2s | [Social, Written] | None | None | | [AfriSentiLangClassification](https://huggingface.co/datasets/HausaNLP/afrisenti-lid-data/) | ['amh', 'arq', 'ary', 'hau', 'ibo', 'kin', 'pcm', 'por', 'swa', 'tso', 'twi', 'yor'] | Classification | s2s | [Social, Written] | None | None | -| [AllegroReviews](https://aclanthology.org/2020.acl-main.111.pdf) | ['pol'] | Classification | s2s | | None | None | +| [AllegroReviews](https://aclanthology.org/2020.acl-main.111.pdf) | ['pol'] | Classification | s2s | [Reviews] | None | None | | [AlloProfClusteringP2P.v2](https://huggingface.co/datasets/lyon-nlp/alloprof) (Lefebvre-Brossard et al., 2023) | ['fra'] | Clustering | p2p | [Encyclopaedic, Written] | None | None | | [AlloProfClusteringS2S.v2](https://huggingface.co/datasets/lyon-nlp/alloprof) (Lefebvre-Brossard et al., 2023) | ['fra'] | Clustering | s2s | [Encyclopaedic, Written] | None | None | | [AlloprofReranking](https://huggingface.co/datasets/antoinelb7/alloprof) (Lefebvre-Brossard et al., 2023) | ['fra'] | Reranking | s2p | [Academic, Web, Written] | None | None | | [AlloprofRetrieval](https://huggingface.co/datasets/antoinelb7/alloprof) (Lefebvre-Brossard et al., 2023) | ['fra'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | -| [AlphaNLI](https://leaderboard.allenai.org/anli/submissions/get-started) (Xiao et al., 2024) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | -| [AmazonCounterfactualClassification](https://arxiv.org/abs/2104.06893) | ['deu', 'eng', 'jpn'] | Classification | s2s | [Reviews, Written] | None | None | +| 
[AlphaNLI](https://leaderboard.allenai.org/anli/submissions/get-started) (Bhagavatula et al., 2019) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | +| [AmazonCounterfactualClassification](https://arxiv.org/abs/2104.06893) (O'Neill et al., 2021) | ['deu', 'eng', 'jpn'] | Classification | s2s | [Reviews, Written] | None | None | | [AmazonPolarityClassification](https://huggingface.co/datasets/amazon_polarity) (Julian McAuley, 2013) | ['eng'] | Classification | p2p | [Reviews, Written] | None | None | | [AmazonReviewsClassification](https://arxiv.org/abs/2010.02573) (Phillip Keung, 2020) | ['cmn', 'deu', 'eng', 'fra', 'jpn', 'spa'] | Classification | s2s | [Reviews, Written] | None | None | | [AngryTweetsClassification](https://aclanthology.org/2021.nodalida-main.53/) (Pauli et al., 2021) | ['dan'] | Classification | s2s | [Social, Written] | None | None | @@ -40,7 +40,7 @@ The following tables give you an overview of the tasks in MTEB. | [ArguAna-PL](https://huggingface.co/datasets/clarin-knext/arguana-pl) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | [Medical, Written] | None | None | | [ArmenianParaphrasePC](https://github.com/ivannikov-lab/arpa-paraphrase-corpus) (Arthur Malajyan, 2020) | ['hye'] | PairClassification | s2s | [News, Written] | None | None | | [ArxivClassification](https://ieeexplore.ieee.org/document/8675939) (He et al., 2019) | ['eng'] | Classification | s2s | [Academic, Written] | None | None | -| [AskUbuntuDupQuestions](https://github.com/taolei87/askubuntu) | ['eng'] | Reranking | s2s | [Programming, Web] | {'test': 375} | {'test': {'num_samples': 375, 'number_of_characters': 413674, 'num_positive': 2255, 'num_negative': 5245, 'min_query_length': 17, 'avg_query_length': 50.21, 'max_query_length': 148, 'unique_query': 374, 'min_positive_length': 15, 'avg_positive_length': 52.54, 'max_positive_length': 152, 'unique_positive': 2165, 'min_negative_length': 15, 'avg_negative_length': 52.69, 'max_negative_length': 148, 
'unique_negative': 5002}} | +| [AskUbuntuDupQuestions](https://github.com/taolei87/askubuntu) (Wang et al., 2021) | ['eng'] | Reranking | s2s | [Programming, Web] | {'test': 375} | {'test': {'num_samples': 375, 'number_of_characters': 413674, 'num_positive': 2255, 'num_negative': 5245, 'min_query_length': 17, 'avg_query_length': 50.21, 'max_query_length': 148, 'unique_query': 374, 'min_positive_length': 15, 'avg_positive_length': 52.54, 'max_positive_length': 152, 'unique_positive': 2165, 'min_negative_length': 15, 'avg_negative_length': 52.69, 'max_negative_length': 148, 'unique_negative': 5002}} | | [Assin2RTE](https://link.springer.com/chapter/10.1007/978-3-030-41505-1_39) (Real et al., 2020) | ['por'] | PairClassification | s2s | [Written] | None | None | | [Assin2STS](https://link.springer.com/chapter/10.1007/978-3-030-41505-1_39) (Real et al., 2020) | ['por'] | STS | s2s | [Written] | None | None | | [AutoRAGRetrieval](https://arxiv.org/abs/2410.20878) (Dongkyu Kim, 2024) | ['kor'] | Retrieval | s2p | [Financial, Government, Legal, Medical, Social] | {'test': 834} | {'test': {'number_of_characters': 894.22, 'num_samples': 834, 'num_queries': 114, 'num_documents': 720, 'average_document_length': 1.15, 'average_query_length': 0.61, 'average_relevant_docs_per_query': 1.0}} | @@ -51,15 +51,16 @@ The following tables give you an overview of the tasks in MTEB. 
| [BLINKIT2TRetrieval](https://arxiv.org/abs/2404.12390) (Fu et al., 2024) | ['eng'] | Any2AnyRetrieval | it2t | [Encyclopaedic] | {'test': 813} | {'test': {'number_of_characters': 54272, 'num_samples': 813, 'num_queries': 793, 'num_documents': 20, 'min_document_length': 1, 'average_document_length': 5.8, 'max_document_length': 14, 'unique_documents': 20, 'num_document_images': 0, 'min_query_length': 22, 'average_query_length': 68.29, 'max_query_length': 135, 'unique_queries': 347, 'num_query_images': 793, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 20}} | | [BQ](https://aclanthology.org/2021.emnlp-main.357) (Shitao Xiao, 2024) | ['cmn'] | STS | s2s | | None | None | | [BSARDRetrieval](https://huggingface.co/datasets/maastrichtlawtech/bsard) (Louis et al., 2022) | ['fra'] | Retrieval | s2p | [Legal, Spoken] | None | None | -| [BUCC.v2](https://comparable.limsi.fr/bucc2018/bucc2018-task.html) | ['cmn', 'deu', 'eng', 'fra', 'rus'] | BitextMining | s2s | [Written] | {'test': 35000} | {'test': {'num_samples': 35000, 'number_of_characters': 6640032, 'unique_pairs': 34978, 'min_sentence1_length': 16, 'average_sentence1_length': 99.11, 'max_sentence1_length': 204, 'unique_sentence1': 34978, 'min_sentence2_length': 42, 'average_sentence2_length': 90.61, 'max_sentence2_length': 159, 'unique_sentence2': 25306, 'hf_subset_descriptive_stats': {'de-en': {'num_samples': 9580, 'number_of_characters': 1919197, 'unique_pairs': 9573, 'min_sentence1_length': 50, 'average_sentence1_length': 109.08, 'max_sentence1_length': 204, 'unique_sentence1': 9573, 'min_sentence2_length': 46, 'average_sentence2_length': 91.25, 'max_sentence2_length': 155, 'unique_sentence2': 9570}, 'fr-en': {'num_samples': 9086, 'number_of_characters': 1677545, 'unique_pairs': 9081, 'min_sentence1_length': 43, 'average_sentence1_length': 99.32, 'max_sentence1_length': 174, 'unique_sentence1': 9081, 'min_sentence2_length': 42, 
'average_sentence2_length': 85.31, 'max_sentence2_length': 159, 'unique_sentence2': 9076}, 'ru-en': {'num_samples': 14435, 'number_of_characters': 2808206, 'unique_pairs': 14425, 'min_sentence1_length': 40, 'average_sentence1_length': 101.66, 'max_sentence1_length': 186, 'unique_sentence1': 14425, 'min_sentence2_length': 45, 'average_sentence2_length': 92.88, 'max_sentence2_length': 159, 'unique_sentence2': 14424}, 'zh-en': {'num_samples': 1899, 'number_of_characters': 235084, 'unique_pairs': 1899, 'min_sentence1_length': 16, 'average_sentence1_length': 28.43, 'max_sentence1_length': 40, 'unique_sentence1': 1899, 'min_sentence2_length': 48, 'average_sentence2_length': 95.36, 'max_sentence2_length': 159, 'unique_sentence2': 1899}}}} | -| [Banking77Classification](https://arxiv.org/abs/2003.04807) | ['eng'] | Classification | s2s | [Written] | None | None | +| [BUCC.v2](https://comparable.limsi.fr/bucc2018/bucc2018-task.html) (Zweigenbaum et al., 2017) | ['cmn', 'deu', 'eng', 'fra', 'rus'] | BitextMining | s2s | [Written] | {'test': 35000} | {'test': {'num_samples': 35000, 'number_of_characters': 6640032, 'unique_pairs': 34978, 'min_sentence1_length': 16, 'average_sentence1_length': 99.11, 'max_sentence1_length': 204, 'unique_sentence1': 34978, 'min_sentence2_length': 42, 'average_sentence2_length': 90.61, 'max_sentence2_length': 159, 'unique_sentence2': 25306, 'hf_subset_descriptive_stats': {'de-en': {'num_samples': 9580, 'number_of_characters': 1919197, 'unique_pairs': 9573, 'min_sentence1_length': 50, 'average_sentence1_length': 109.08, 'max_sentence1_length': 204, 'unique_sentence1': 9573, 'min_sentence2_length': 46, 'average_sentence2_length': 91.25, 'max_sentence2_length': 155, 'unique_sentence2': 9570}, 'fr-en': {'num_samples': 9086, 'number_of_characters': 1677545, 'unique_pairs': 9081, 'min_sentence1_length': 43, 'average_sentence1_length': 99.32, 'max_sentence1_length': 174, 'unique_sentence1': 9081, 'min_sentence2_length': 42, 'average_sentence2_length': 
85.31, 'max_sentence2_length': 159, 'unique_sentence2': 9076}, 'ru-en': {'num_samples': 14435, 'number_of_characters': 2808206, 'unique_pairs': 14425, 'min_sentence1_length': 40, 'average_sentence1_length': 101.66, 'max_sentence1_length': 186, 'unique_sentence1': 14425, 'min_sentence2_length': 45, 'average_sentence2_length': 92.88, 'max_sentence2_length': 159, 'unique_sentence2': 14424}, 'zh-en': {'num_samples': 1899, 'number_of_characters': 235084, 'unique_pairs': 1899, 'min_sentence1_length': 16, 'average_sentence1_length': 28.43, 'max_sentence1_length': 40, 'unique_sentence1': 1899, 'min_sentence2_length': 48, 'average_sentence2_length': 95.36, 'max_sentence2_length': 159, 'unique_sentence2': 1899}}}} | +| [Banking77Classification](https://arxiv.org/abs/2003.04807) (Casanueva et al., 2020) | ['eng'] | Classification | s2s | [Written] | None | None | +| [BeijingOpera](https://huggingface.co/datasets/silky1708/BeijingOpera) (Tian et al., 2014) | ['eng'] | AudioClassification | a2t | [Music] | None | None | | [BelebeleRetrieval](https://arxiv.org/abs/2308.16884) (Lucas Bandarkar, 2023) | ['acm', 'afr', 'als', 'amh', 'apc', 'arb', 'ars', 'ary', 'arz', 'asm', 'azj', 'bam', 'ben', 'bod', 'bul', 'cat', 'ceb', 'ces', 'ckb', 'dan', 'deu', 'ell', 'eng', 'est', 'eus', 'fin', 'fra', 'fuv', 'gaz', 'grn', 'guj', 'hat', 'hau', 'heb', 'hin', 'hrv', 'hun', 'hye', 'ibo', 'ilo', 'ind', 'isl', 'ita', 'jav', 'jpn', 'kac', 'kan', 'kat', 'kaz', 'kea', 'khk', 'khm', 'kin', 'kir', 'kor', 'lao', 'lin', 'lit', 'lug', 'luo', 'lvs', 'mal', 'mar', 'mkd', 'mlt', 'mri', 'mya', 'nld', 'nob', 'npi', 'nso', 'nya', 'ory', 'pan', 'pbt', 'pes', 'plt', 'pol', 'por', 'ron', 'rus', 'shn', 'sin', 'slk', 'slv', 'sna', 'snd', 'som', 'sot', 'spa', 'srp', 'ssw', 'sun', 'swe', 'swh', 'tam', 'tel', 'tgk', 'tgl', 'tha', 'tir', 'tsn', 'tso', 'tur', 'ukr', 'urd', 'uzn', 'vie', 'war', 'wol', 'xho', 'yor', 'zho', 'zsm', 'zul'] | Retrieval | s2p | [News, Web, Written] | {'test': 521866} | {'test': 
{'number_of_characters': 25574620, 'num_samples': 521866, 'num_queries': 338378, 'num_documents': 183488, 'min_document_length': 4, 'average_document_length': 137.38, 'max_document_length': 237, 'unique_documents': 183488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 338378, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 183488, 'hf_subset_descriptive_stats': {'acm_Arab-acm_Arab': {'number_of_characters': 51232, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 102.98, 'max_document_length': 129, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'acm_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-acm_Arab': {'number_of_characters': 51232, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 102.98, 'max_document_length': 129, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'afr_Latn-afr_Latn': {'number_of_characters': 71217, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 17, 'average_document_length': 143.94, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'afr_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-afr_Latn': {'number_of_characters': 71217, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 143.94, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'als_Latn-als_Latn': {'number_of_characters': 69498, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 140.41, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'als_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-als_Latn': {'number_of_characters': 69498, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 140.41, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'amh_Ethi-amh_Ethi': {'number_of_characters': 45221, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 90.67, 'max_document_length': 100, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'amh_Ethi-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-amh_Ethi': {'number_of_characters': 45221, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 90.67, 'max_document_length': 100, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'apc_Arab-apc_Arab': {'number_of_characters': 51248, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 103.02, 'max_document_length': 134, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'apc_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-apc_Arab': {'number_of_characters': 51248, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 103.02, 'max_document_length': 134, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'arb_Arab-arb_Arab': {'number_of_characters': 53671, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 107.98, 'max_document_length': 134, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'arb_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-arb_Arab': {'number_of_characters': 53671, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 107.98, 'max_document_length': 134, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'arb_Latn-arb_Latn': {'number_of_characters': 61298, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 123.61, 'max_document_length': 160, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'arb_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-arb_Latn': {'number_of_characters': 61298, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 12, 'average_document_length': 123.61, 'max_document_length': 160, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ars_Arab-ars_Arab': {'number_of_characters': 51765, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 104.08, 'max_document_length': 119, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ars_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ars_Arab': {'number_of_characters': 51765, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 104.08, 'max_document_length': 119, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ary_Arab-ary_Arab': {'number_of_characters': 60261, 'num_samples': 1386, 'num_queries': 898, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 121.49, 'max_document_length': 138, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 898, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ary_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ary_Arab': {'number_of_characters': 60261, 'num_samples': 1386, 'num_queries': 898, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 121.49, 'max_document_length': 138, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 898, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'arz_Arab-arz_Arab': {'number_of_characters': 52403, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 105.38, 'max_document_length': 115, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'arz_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-arz_Arab': {'number_of_characters': 52403, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 105.38, 'max_document_length': 115, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'asm_Beng-asm_Beng': {'number_of_characters': 62410, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 4, 'average_document_length': 125.89, 'max_document_length': 158, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'asm_Beng-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-asm_Beng': {'number_of_characters': 62410, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 4, 'average_document_length': 125.89, 'max_document_length': 158, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'azj_Latn-azj_Latn': {'number_of_characters': 67137, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 135.58, 'max_document_length': 156, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'azj_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-azj_Latn': {'number_of_characters': 67137, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 135.58, 'max_document_length': 156, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'bam_Latn-bam_Latn': {'number_of_characters': 66084, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 133.42, 'max_document_length': 166, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'bam_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-bam_Latn': {'number_of_characters': 66084, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 133.42, 'max_document_length': 166, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ben_Beng-ben_Beng': {'number_of_characters': 63512, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 9, 'average_document_length': 128.15, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ben_Beng-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ben_Beng': {'number_of_characters': 63512, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 9, 'average_document_length': 128.15, 'max_document_length': 175, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ben_Latn-ben_Latn': {'number_of_characters': 68285, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 9, 'average_document_length': 137.93, 'max_document_length': 185, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ben_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ben_Latn': {'number_of_characters': 68285, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 9, 'average_document_length': 137.93, 'max_document_length': 185, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'bod_Tibt-bod_Tibt': {'number_of_characters': 79188, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 160.27, 'max_document_length': 213, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'bod_Tibt-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-bod_Tibt': {'number_of_characters': 79188, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 160.27, 'max_document_length': 213, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'bul_Cyrl-bul_Cyrl': {'number_of_characters': 66577, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 134.43, 'max_document_length': 177, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'bul_Cyrl-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'eng_Latn-bul_Cyrl': {'number_of_characters': 66577, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 134.43, 'max_document_length': 177, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'cat_Latn-cat_Latn': {'number_of_characters': 68842, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 139.07, 'max_document_length': 163, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'cat_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-cat_Latn': {'number_of_characters': 68842, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 139.07, 'max_document_length': 163, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ceb_Latn-ceb_Latn': {'number_of_characters': 74053, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 15, 'average_document_length': 149.75, 'max_document_length': 184, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ceb_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ceb_Latn': {'number_of_characters': 74053, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 149.75, 'max_document_length': 184, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ces_Latn-ces_Latn': {'number_of_characters': 61936, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 124.92, 'max_document_length': 139, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ces_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ces_Latn': {'number_of_characters': 61936, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 124.92, 'max_document_length': 139, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ckb_Arab-ckb_Arab': {'number_of_characters': 64917, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 131.03, 'max_document_length': 178, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ckb_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ckb_Arab': {'number_of_characters': 64917, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 131.03, 'max_document_length': 178, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'dan_Latn-dan_Latn': {'number_of_characters': 66648, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 134.57, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'dan_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-dan_Latn': {'number_of_characters': 66648, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 134.57, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'deu_Latn-deu_Latn': {'number_of_characters': 68768, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 138.92, 'max_document_length': 182, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'deu_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-deu_Latn': {'number_of_characters': 68768, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 138.92, 'max_document_length': 182, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ell_Grek-ell_Grek': {'number_of_characters': 79210, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 160.32, 'max_document_length': 212, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ell_Grek-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ell_Grek': {'number_of_characters': 79210, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 18, 'average_document_length': 160.32, 'max_document_length': 212, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'est_Latn-est_Latn': {'number_of_characters': 61779, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 124.6, 'max_document_length': 164, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'est_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-est_Latn': {'number_of_characters': 61779, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 124.6, 'max_document_length': 164, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eus_Latn-eus_Latn': {'number_of_characters': 67979, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 137.3, 'max_document_length': 169, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eus_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-eus_Latn': {'number_of_characters': 67979, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 137.3, 'max_document_length': 169, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'fin_Latn-fin_Latn': {'number_of_characters': 66234, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 133.73, 'max_document_length': 161, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'fin_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-fin_Latn': {'number_of_characters': 66234, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 133.73, 'max_document_length': 161, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'fra_Latn-fra_Latn': {'number_of_characters': 82464, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 19, 'average_document_length': 166.98, 'max_document_length': 204, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'fra_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'eng_Latn-fra_Latn': {'number_of_characters': 82464, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 19, 'average_document_length': 166.98, 'max_document_length': 204, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'fuv_Latn-fuv_Latn': {'number_of_characters': 53555, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 107.74, 'max_document_length': 122, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'fuv_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-fuv_Latn': {'number_of_characters': 53555, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 107.74, 'max_document_length': 122, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'gaz_Latn-gaz_Latn': {'number_of_characters': 78315, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 16, 'average_document_length': 158.48, 'max_document_length': 191, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'gaz_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-gaz_Latn': {'number_of_characters': 78315, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 158.48, 'max_document_length': 191, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'grn_Latn-grn_Latn': {'number_of_characters': 68572, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 138.52, 'max_document_length': 161, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'grn_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-grn_Latn': {'number_of_characters': 68572, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 138.52, 'max_document_length': 161, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'guj_Gujr-guj_Gujr': {'number_of_characters': 57007, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 114.82, 'max_document_length': 138, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'guj_Gujr-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-guj_Gujr': {'number_of_characters': 57007, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 114.82, 'max_document_length': 138, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hat_Latn-hat_Latn': {'number_of_characters': 64558, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 130.29, 'max_document_length': 179, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hat_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-hat_Latn': {'number_of_characters': 64558, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 130.29, 'max_document_length': 179, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hau_Latn-hau_Latn': {'number_of_characters': 78240, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 158.33, 'max_document_length': 183, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'hau_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-hau_Latn': {'number_of_characters': 78240, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 158.33, 'max_document_length': 183, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'heb_Hebr-heb_Hebr': {'number_of_characters': 50598, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 101.68, 'max_document_length': 134, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'heb_Hebr-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-heb_Hebr': {'number_of_characters': 50598, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 12, 'average_document_length': 101.68, 'max_document_length': 134, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hin_Deva-hin_Deva': {'number_of_characters': 66332, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 133.93, 'max_document_length': 165, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hin_Deva-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-hin_Deva': {'number_of_characters': 66332, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 133.93, 'max_document_length': 165, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hin_Latn-hin_Latn': {'number_of_characters': 68307, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 137.97, 'max_document_length': 170, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hin_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-hin_Latn': {'number_of_characters': 68307, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 137.97, 'max_document_length': 170, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hrv_Latn-hrv_Latn': {'number_of_characters': 62928, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 126.95, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hrv_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-hrv_Latn': {'number_of_characters': 62928, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 126.95, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hun_Latn-hun_Latn': {'number_of_characters': 67941, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 137.22, 'max_document_length': 176, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hun_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-hun_Latn': {'number_of_characters': 67941, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 137.22, 'max_document_length': 176, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'hye_Armn-hye_Armn': {'number_of_characters': 68859, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 139.1, 'max_document_length': 193, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hye_Armn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-hye_Armn': {'number_of_characters': 68859, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 139.1, 'max_document_length': 193, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ibo_Latn-ibo_Latn': {'number_of_characters': 66167, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 19, 'average_document_length': 133.59, 'max_document_length': 156, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'ibo_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ibo_Latn': {'number_of_characters': 66167, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 19, 'average_document_length': 133.59, 'max_document_length': 156, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'ilo_Latn-ilo_Latn': {'number_of_characters': 78161, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 158.17, 'max_document_length': 187, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ilo_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ilo_Latn': {'number_of_characters': 78161, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 158.17, 'max_document_length': 187, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ind_Latn-ind_Latn': {'number_of_characters': 74871, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 19, 'average_document_length': 151.42, 'max_document_length': 207, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ind_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ind_Latn': {'number_of_characters': 74871, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 19, 'average_document_length': 151.42, 'max_document_length': 207, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'isl_Latn-isl_Latn': {'number_of_characters': 70522, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 142.51, 'max_document_length': 170, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'isl_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-isl_Latn': {'number_of_characters': 70522, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 142.51, 'max_document_length': 170, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ita_Latn-ita_Latn': {'number_of_characters': 76124, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 153.99, 'max_document_length': 185, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ita_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'eng_Latn-ita_Latn': {'number_of_characters': 76124, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 153.99, 'max_document_length': 185, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'jav_Latn-jav_Latn': {'number_of_characters': 71722, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 144.97, 'max_document_length': 174, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'jav_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-jav_Latn': {'number_of_characters': 71722, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 144.97, 'max_document_length': 174, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'jpn_Jpan-jpn_Jpan': {'number_of_characters': 33187, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 14, 'average_document_length': 66.01, 'max_document_length': 76, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'jpn_Jpan-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-jpn_Jpan': {'number_of_characters': 33187, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 66.01, 'max_document_length': 76, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kac_Latn-kac_Latn': {'number_of_characters': 89655, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 181.72, 'max_document_length': 195, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kac_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 
488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-kac_Latn': {'number_of_characters': 89655, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 181.72, 'max_document_length': 195, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kan_Knda-kan_Knda': {'number_of_characters': 65899, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 133.04, 'max_document_length': 165, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kan_Knda-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-kan_Knda': {'number_of_characters': 65899, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 133.04, 'max_document_length': 165, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kat_Geor-kat_Geor': {'number_of_characters': 68309, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 137.98, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kat_Geor-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-kat_Geor': {'number_of_characters': 68309, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 137.98, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kaz_Cyrl-kaz_Cyrl': {'number_of_characters': 64657, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 130.49, 'max_document_length': 158, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'kaz_Cyrl-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-kaz_Cyrl': {'number_of_characters': 64657, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 130.49, 'max_document_length': 158, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kea_Latn-kea_Latn': {'number_of_characters': 69323, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 140.06, 'max_document_length': 183, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kea_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-kea_Latn': {'number_of_characters': 69323, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 14, 'average_document_length': 140.06, 'max_document_length': 183, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'khk_Cyrl-khk_Cyrl': {'number_of_characters': 66977, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 135.25, 'max_document_length': 162, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'khk_Cyrl-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-khk_Cyrl': {'number_of_characters': 66977, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 135.25, 'max_document_length': 162, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'khm_Khmr-khm_Khmr': {'number_of_characters': 69150, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 139.7, 'max_document_length': 169, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'khm_Khmr-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-khm_Khmr': {'number_of_characters': 69150, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 139.7, 'max_document_length': 169, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kin_Latn-kin_Latn': {'number_of_characters': 72803, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 147.19, 'max_document_length': 194, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'kin_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-kin_Latn': {'number_of_characters': 72803, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 147.19, 'max_document_length': 194, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'kir_Cyrl-kir_Cyrl': {'number_of_characters': 67957, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 137.26, 'max_document_length': 182, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kir_Cyrl-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-kir_Cyrl': {'number_of_characters': 67957, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 137.26, 'max_document_length': 182, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'kor_Hang-kor_Hang': {'number_of_characters': 32708, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 65.02, 'max_document_length': 88, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kor_Hang-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-kor_Hang': {'number_of_characters': 32708, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 65.02, 'max_document_length': 88, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'lao_Laoo-lao_Laoo': {'number_of_characters': 57958, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 116.77, 'max_document_length': 142, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'lao_Laoo-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-lao_Laoo': {'number_of_characters': 57958, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 116.77, 'max_document_length': 142, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'lin_Latn-lin_Latn': {'number_of_characters': 74223, 'num_samples': 1386, 'num_queries': 898, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 150.1, 'max_document_length': 183, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 898, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'lin_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-lin_Latn': {'number_of_characters': 74223, 'num_samples': 1386, 'num_queries': 898, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 150.1, 'max_document_length': 183, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 898, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'lit_Latn-lit_Latn': {'number_of_characters': 62805, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 126.7, 'max_document_length': 167, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'lit_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-lit_Latn': {'number_of_characters': 62805, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 126.7, 'max_document_length': 167, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'lug_Latn-lug_Latn': {'number_of_characters': 71566, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 144.65, 'max_document_length': 237, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'lug_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-lug_Latn': {'number_of_characters': 71566, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 144.65, 'max_document_length': 237, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'luo_Latn-luo_Latn': {'number_of_characters': 66805, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 134.9, 'max_document_length': 178, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'luo_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'eng_Latn-luo_Latn': {'number_of_characters': 66805, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 134.9, 'max_document_length': 178, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'lvs_Latn-lvs_Latn': {'number_of_characters': 63957, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 129.06, 'max_document_length': 172, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'lvs_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-lvs_Latn': {'number_of_characters': 63957, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 129.06, 'max_document_length': 172, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mal_Mlym-mal_Mlym': {'number_of_characters': 73599, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 14, 'average_document_length': 148.82, 'max_document_length': 191, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mal_Mlym-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-mal_Mlym': {'number_of_characters': 73599, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 148.82, 'max_document_length': 191, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mar_Deva-mar_Deva': {'number_of_characters': 62671, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 126.42, 'max_document_length': 160, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'mar_Deva-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-mar_Deva': {'number_of_characters': 62671, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 126.42, 'max_document_length': 160, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'mkd_Cyrl-mkd_Cyrl': {'number_of_characters': 67588, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 136.5, 'max_document_length': 180, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mkd_Cyrl-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-mkd_Cyrl': {'number_of_characters': 67588, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 136.5, 'max_document_length': 180, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mlt_Latn-mlt_Latn': {'number_of_characters': 68480, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 138.33, 'max_document_length': 185, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mlt_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-mlt_Latn': {'number_of_characters': 68480, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 138.33, 'max_document_length': 185, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mri_Latn-mri_Latn': {'number_of_characters': 74519, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 150.7, 'max_document_length': 185, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'mri_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-mri_Latn': {'number_of_characters': 74519, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 150.7, 'max_document_length': 185, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mya_Mymr-mya_Mymr': {'number_of_characters': 81331, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 164.66, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mya_Mymr-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-mya_Mymr': {'number_of_characters': 81331, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 18, 'average_document_length': 164.66, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'nld_Latn-nld_Latn': {'number_of_characters': 68789, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 138.96, 'max_document_length': 183, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'nld_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-nld_Latn': {'number_of_characters': 68789, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 138.96, 'max_document_length': 183, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'nob_Latn-nob_Latn': {'number_of_characters': 64917, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 131.03, 'max_document_length': 168, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'nob_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-nob_Latn': {'number_of_characters': 64917, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 131.03, 'max_document_length': 168, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'npi_Deva-npi_Deva': {'number_of_characters': 61183, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 123.38, 'max_document_length': 154, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'npi_Deva-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-npi_Deva': {'number_of_characters': 61183, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 123.38, 'max_document_length': 154, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'npi_Latn-npi_Latn': {'number_of_characters': 65683, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 20, 'average_document_length': 132.6, 'max_document_length': 154, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'npi_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-npi_Latn': {'number_of_characters': 65683, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 20, 'average_document_length': 132.6, 'max_document_length': 154, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'nso_Latn-nso_Latn': {'number_of_characters': 79073, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 160.03, 'max_document_length': 235, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'nso_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-nso_Latn': {'number_of_characters': 79073, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 160.03, 'max_document_length': 235, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'nya_Latn-nya_Latn': {'number_of_characters': 82685, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 167.44, 'max_document_length': 215, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'nya_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-nya_Latn': {'number_of_characters': 82685, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 167.44, 'max_document_length': 215, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ory_Orya-ory_Orya': {'number_of_characters': 66638, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 10, 'average_document_length': 134.55, 'max_document_length': 168, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ory_Orya-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ory_Orya': {'number_of_characters': 66638, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 10, 'average_document_length': 134.55, 'max_document_length': 168, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'pan_Guru-pan_Guru': {'number_of_characters': 66944, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 135.18, 'max_document_length': 157, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'pan_Guru-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-pan_Guru': {'number_of_characters': 66944, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 135.18, 'max_document_length': 157, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'pbt_Arab-pbt_Arab': {'number_of_characters': 61880, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 124.8, 'max_document_length': 155, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'pbt_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-pbt_Arab': {'number_of_characters': 61880, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 124.8, 'max_document_length': 155, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'pes_Arab-pes_Arab': {'number_of_characters': 59252, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 119.42, 'max_document_length': 152, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'pes_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'eng_Latn-pes_Arab': {'number_of_characters': 59252, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 119.42, 'max_document_length': 152, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'plt_Latn-plt_Latn': {'number_of_characters': 86472, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 175.2, 'max_document_length': 222, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'plt_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-plt_Latn': {'number_of_characters': 86472, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 175.2, 'max_document_length': 222, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'pol_Latn-pol_Latn': {'number_of_characters': 67664, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 16, 'average_document_length': 136.66, 'max_document_length': 196, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'pol_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-pol_Latn': {'number_of_characters': 67664, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 136.66, 'max_document_length': 196, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'por_Latn-por_Latn': {'number_of_characters': 71281, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 144.07, 'max_document_length': 179, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'por_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-por_Latn': {'number_of_characters': 71281, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 144.07, 'max_document_length': 179, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ron_Latn-ron_Latn': {'number_of_characters': 71844, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 145.22, 'max_document_length': 181, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ron_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ron_Latn': {'number_of_characters': 71844, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 145.22, 'max_document_length': 181, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'rus_Cyrl-rus_Cyrl': {'number_of_characters': 75823, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 153.38, 'max_document_length': 196, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'rus_Cyrl-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-rus_Cyrl': {'number_of_characters': 75823, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 153.38, 'max_document_length': 196, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'shn_Mymr-shn_Mymr': {'number_of_characters': 69288, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 139.98, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'shn_Mymr-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-shn_Mymr': {'number_of_characters': 69288, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 139.98, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sin_Latn-sin_Latn': {'number_of_characters': 85996, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 19, 'average_document_length': 174.22, 'max_document_length': 224, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sin_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-sin_Latn': {'number_of_characters': 85996, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 19, 'average_document_length': 174.22, 'max_document_length': 224, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sin_Sinh-sin_Sinh': {'number_of_characters': 63902, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 128.95, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sin_Sinh-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-sin_Sinh': {'number_of_characters': 63902, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 128.95, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'slk_Latn-slk_Latn': {'number_of_characters': 62663, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 126.41, 'max_document_length': 146, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'slk_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-slk_Latn': {'number_of_characters': 62663, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 126.41, 'max_document_length': 146, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'slv_Latn-slv_Latn': {'number_of_characters': 62895, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 126.88, 'max_document_length': 176, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'slv_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-slv_Latn': {'number_of_characters': 62895, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 126.88, 'max_document_length': 176, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sna_Latn-sna_Latn': {'number_of_characters': 74071, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 20, 'average_document_length': 149.78, 'max_document_length': 191, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sna_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-sna_Latn': {'number_of_characters': 74071, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 20, 'average_document_length': 149.78, 'max_document_length': 191, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'snd_Arab-snd_Arab': {'number_of_characters': 58057, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 116.97, 'max_document_length': 164, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'snd_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-snd_Arab': {'number_of_characters': 58057, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 116.97, 'max_document_length': 164, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'som_Latn-som_Latn': {'number_of_characters': 82838, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 167.75, 'max_document_length': 201, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'som_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-som_Latn': {'number_of_characters': 82838, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 167.75, 'max_document_length': 201, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sot_Latn-sot_Latn': {'number_of_characters': 75794, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 153.32, 'max_document_length': 186, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sot_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-sot_Latn': {'number_of_characters': 75794, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 153.32, 'max_document_length': 186, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'spa_Latn-spa_Latn': {'number_of_characters': 74920, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 151.52, 'max_document_length': 180, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'spa_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-spa_Latn': {'number_of_characters': 74920, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 151.52, 'max_document_length': 180, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'srp_Cyrl-srp_Cyrl': {'number_of_characters': 61657, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 124.35, 'max_document_length': 160, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'srp_Cyrl-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-srp_Cyrl': {'number_of_characters': 61657, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 124.35, 'max_document_length': 160, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'ssw_Latn-ssw_Latn': {'number_of_characters': 73964, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 149.57, 'max_document_length': 182, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ssw_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'eng_Latn-ssw_Latn': {'number_of_characters': 73964, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 149.57, 'max_document_length': 182, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sun_Latn-sun_Latn': {'number_of_characters': 71320, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 144.15, 'max_document_length': 173, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sun_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-sun_Latn': {'number_of_characters': 71320, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 144.15, 'max_document_length': 173, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'swe_Latn-swe_Latn': {'number_of_characters': 62785, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 16, 'average_document_length': 126.66, 'max_document_length': 154, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'swe_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-swe_Latn': {'number_of_characters': 62785, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 126.66, 'max_document_length': 154, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'swh_Latn-swh_Latn': {'number_of_characters': 73480, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 148.57, 'max_document_length': 194, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'swh_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-swh_Latn': {'number_of_characters': 73480, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 148.57, 'max_document_length': 194, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tam_Taml-tam_Taml': {'number_of_characters': 73991, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 20, 'average_document_length': 149.62, 'max_document_length': 181, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tam_Taml-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-tam_Taml': {'number_of_characters': 73991, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 20, 'average_document_length': 149.62, 'max_document_length': 181, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tel_Telu-tel_Telu': {'number_of_characters': 65945, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 133.13, 'max_document_length': 149, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tel_Telu-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-tel_Telu': {'number_of_characters': 65945, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 133.13, 'max_document_length': 149, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tgk_Cyrl-tgk_Cyrl': {'number_of_characters': 67829, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 136.99, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'tgk_Cyrl-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-tgk_Cyrl': {'number_of_characters': 67829, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 136.99, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tgl_Latn-tgl_Latn': {'number_of_characters': 75087, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 151.87, 'max_document_length': 184, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tgl_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-tgl_Latn': {'number_of_characters': 75087, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 14, 'average_document_length': 151.87, 'max_document_length': 184, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tha_Thai-tha_Thai': {'number_of_characters': 54496, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 109.67, 'max_document_length': 123, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tha_Thai-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-tha_Thai': {'number_of_characters': 54496, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 109.67, 'max_document_length': 123, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tir_Ethi-tir_Ethi': {'number_of_characters': 47775, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 95.9, 'max_document_length': 110, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tir_Ethi-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-tir_Ethi': {'number_of_characters': 47775, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 95.9, 'max_document_length': 110, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tsn_Latn-tsn_Latn': {'number_of_characters': 79391, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 160.69, 'max_document_length': 204, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tsn_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-tsn_Latn': {'number_of_characters': 79391, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 160.69, 'max_document_length': 204, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tso_Latn-tso_Latn': {'number_of_characters': 83501, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 169.11, 'max_document_length': 215, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tso_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-tso_Latn': {'number_of_characters': 83501, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 169.11, 'max_document_length': 215, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'tur_Latn-tur_Latn': {'number_of_characters': 65382, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 131.98, 'max_document_length': 158, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tur_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-tur_Latn': {'number_of_characters': 65382, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 131.98, 'max_document_length': 158, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ukr_Cyrl-ukr_Cyrl': {'number_of_characters': 65850, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 132.94, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ukr_Cyrl-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ukr_Cyrl': {'number_of_characters': 65850, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 132.94, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'urd_Arab-urd_Arab': {'number_of_characters': 64450, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 130.07, 'max_document_length': 187, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'urd_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-urd_Arab': {'number_of_characters': 64450, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 130.07, 'max_document_length': 187, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'urd_Latn-urd_Latn': {'number_of_characters': 82039, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 166.11, 'max_document_length': 230, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'urd_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-urd_Latn': {'number_of_characters': 82039, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 166.11, 'max_document_length': 230, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'uzn_Latn-uzn_Latn': {'number_of_characters': 70828, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 143.14, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'uzn_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-uzn_Latn': {'number_of_characters': 70828, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 143.14, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'vie_Latn-vie_Latn': {'number_of_characters': 66724, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 134.73, 'max_document_length': 161, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'vie_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'eng_Latn-vie_Latn': {'number_of_characters': 66724, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 134.73, 'max_document_length': 161, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'war_Latn-war_Latn': {'number_of_characters': 78444, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 158.75, 'max_document_length': 207, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'war_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-war_Latn': {'number_of_characters': 78444, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 158.75, 'max_document_length': 207, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'wol_Latn-wol_Latn': {'number_of_characters': 64521, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 15, 'average_document_length': 130.22, 'max_document_length': 139, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'wol_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-wol_Latn': {'number_of_characters': 64521, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 130.22, 'max_document_length': 139, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'xho_Latn-xho_Latn': {'number_of_characters': 71629, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 144.78, 'max_document_length': 179, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'xho_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-xho_Latn': {'number_of_characters': 71629, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 144.78, 'max_document_length': 179, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'yor_Latn-yor_Latn': {'number_of_characters': 62752, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 126.59, 'max_document_length': 143, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'yor_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-yor_Latn': {'number_of_characters': 62752, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 126.59, 'max_document_length': 143, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'zho_Hans-zho_Hans': {'number_of_characters': 20549, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 7, 'average_document_length': 40.11, 'max_document_length': 64, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'zho_Hans-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-zho_Hans': {'number_of_characters': 20549, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 7, 'average_document_length': 40.11, 'max_document_length': 64, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'zho_Hant-zho_Hant': {'number_of_characters': 19947, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 7, 'average_document_length': 38.88, 'max_document_length': 45, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 
488}, 'zho_Hant-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-zho_Hant': {'number_of_characters': 19947, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 7, 'average_document_length': 38.88, 'max_document_length': 45, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'zsm_Latn-zsm_Latn': {'number_of_characters': 72008, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 145.56, 'max_document_length': 210, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'zsm_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-zsm_Latn': {'number_of_characters': 72008, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 
'min_document_length': 13, 'average_document_length': 145.56, 'max_document_length': 210, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'zul_Latn-zul_Latn': {'number_of_characters': 69413, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 140.24, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'zul_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-zul_Latn': {'number_of_characters': 69413, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 140.24, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'arb_Arab-arb_Latn': {'number_of_characters': 61298, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 123.61, 'max_document_length': 160, 'unique_documents': 488, 
'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'arb_Latn-arb_Arab': {'number_of_characters': 53671, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 107.98, 'max_document_length': 134, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ben_Beng-ben_Latn': {'number_of_characters': 68285, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 9, 'average_document_length': 137.93, 'max_document_length': 185, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ben_Latn-ben_Beng': {'number_of_characters': 63512, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 9, 'average_document_length': 128.15, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hin_Deva-hin_Latn': {'number_of_characters': 68307, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 137.97, 'max_document_length': 170, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 
1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hin_Latn-hin_Deva': {'number_of_characters': 66332, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 133.93, 'max_document_length': 165, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'npi_Deva-npi_Latn': {'number_of_characters': 65683, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 20, 'average_document_length': 132.6, 'max_document_length': 154, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'npi_Latn-npi_Deva': {'number_of_characters': 61183, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 123.38, 'max_document_length': 154, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sin_Sinh-sin_Latn': {'number_of_characters': 85996, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 19, 'average_document_length': 174.22, 'max_document_length': 224, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sin_Latn-sin_Sinh': 
{'number_of_characters': 63902, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 128.95, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'urd_Arab-urd_Latn': {'number_of_characters': 82039, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 166.11, 'max_document_length': 230, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'urd_Latn-urd_Arab': {'number_of_characters': 64450, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 130.07, 'max_document_length': 187, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}}}} | -| [BengaliDocumentClassification](https://aclanthology.org/2023.eacl-main.4) | ['ben'] | Classification | s2s | [News, Written] | None | None | +| [BengaliDocumentClassification](https://aclanthology.org/2023.eacl-main.4) (Akash et al., 2023) | ['ben'] | Classification | s2s | [News, Written] | None | None | | [BengaliHateSpeechClassification](https://huggingface.co/datasets/bn_hate_speech) (Karim et al., 2020) | ['ben'] | Classification | s2s | [News, Written] | None | None | | [BengaliSentimentAnalysis](https://data.mendeley.com/datasets/p6zc7krs37/4) (Sazzed et al., 2020) | ['ben'] | Classification | 
s2s | [Reviews, Written] | None | None | | [BeytooteClustering](https://mcinext.com/) | ['fas'] | Clustering | p2p | [News] | None | None | | [BibleNLPBitextMining](https://arxiv.org/abs/2304.09919) (Akerman et al., 2023) | ['aai', 'aak', 'aau', 'aaz', 'abt', 'abx', 'aby', 'acf', 'acr', 'acu', 'adz', 'aer', 'aey', 'agd', 'agg', 'agm', 'agn', 'agr', 'agt', 'agu', 'aia', 'aii', 'aka', 'ake', 'alp', 'alq', 'als', 'aly', 'ame', 'amf', 'amk', 'amm', 'amn', 'amo', 'amp', 'amr', 'amu', 'amx', 'anh', 'anv', 'aoi', 'aoj', 'aom', 'aon', 'apb', 'ape', 'apn', 'apr', 'apu', 'apw', 'apz', 'arb', 'are', 'arl', 'arn', 'arp', 'asm', 'aso', 'ata', 'atb', 'atd', 'atg', 'att', 'auc', 'aui', 'auy', 'avt', 'awb', 'awk', 'awx', 'azb', 'azg', 'azz', 'bao', 'bba', 'bbb', 'bbr', 'bch', 'bco', 'bdd', 'bea', 'bef', 'bel', 'ben', 'beo', 'beu', 'bgs', 'bgt', 'bhg', 'bhl', 'big', 'bjk', 'bjp', 'bjr', 'bjv', 'bjz', 'bkd', 'bki', 'bkq', 'bkx', 'blw', 'blz', 'bmh', 'bmk', 'bmr', 'bmu', 'bnp', 'boa', 'boj', 'bon', 'box', 'bpr', 'bps', 'bqc', 'bqp', 'bre', 'bsj', 'bsn', 'bsp', 'bss', 'buk', 'bus', 'bvd', 'bvr', 'bxh', 'byr', 'byx', 'bzd', 'bzh', 'bzj', 'caa', 'cab', 'cac', 'caf', 'cak', 'cao', 'cap', 'car', 'cav', 'cax', 'cbc', 'cbi', 'cbk', 'cbr', 'cbs', 'cbt', 'cbu', 'cbv', 'cco', 'ceb', 'cek', 'ces', 'cgc', 'cha', 'chd', 'chf', 'chk', 'chq', 'chz', 'cjo', 'cjv', 'ckb', 'cle', 'clu', 'cme', 'cmn', 'cni', 'cnl', 'cnt', 'cof', 'con', 'cop', 'cot', 'cpa', 'cpb', 'cpc', 'cpu', 'cpy', 'crn', 'crx', 'cso', 'csy', 'cta', 'cth', 'ctp', 'ctu', 'cub', 'cuc', 'cui', 'cuk', 'cut', 'cux', 'cwe', 'cya', 'daa', 'dad', 'dah', 'dan', 'ded', 'deu', 'dgc', 'dgr', 'dgz', 'dhg', 'dif', 'dik', 'dji', 'djk', 'djr', 'dob', 'dop', 'dov', 'dwr', 'dww', 'dwy', 'ebk', 'eko', 'emi', 'emp', 'eng', 'enq', 'epo', 'eri', 'ese', 'esk', 'etr', 'ewe', 'faa', 'fai', 'far', 'ffm', 'for', 'fra', 'fue', 'fuf', 'fuh', 'gah', 'gai', 'gam', 'gaw', 'gdn', 'gdr', 'geb', 'gfk', 'ghs', 'glk', 'gmv', 'gng', 'gnn', 'gnw', 'gof', 'grc', 'gub', 
'guh', 'gui', 'guj', 'gul', 'gum', 'gun', 'guo', 'gup', 'gux', 'gvc', 'gvf', 'gvn', 'gvs', 'gwi', 'gym', 'gyr', 'hat', 'hau', 'haw', 'hbo', 'hch', 'heb', 'heg', 'hin', 'hix', 'hla', 'hlt', 'hmo', 'hns', 'hop', 'hot', 'hrv', 'hto', 'hub', 'hui', 'hun', 'hus', 'huu', 'huv', 'hvn', 'ian', 'ign', 'ikk', 'ikw', 'ilo', 'imo', 'inb', 'ind', 'ino', 'iou', 'ipi', 'isn', 'ita', 'iws', 'ixl', 'jac', 'jae', 'jao', 'jic', 'jid', 'jiv', 'jni', 'jpn', 'jvn', 'kan', 'kaq', 'kbc', 'kbh', 'kbm', 'kbq', 'kdc', 'kde', 'kdl', 'kek', 'ken', 'kew', 'kgf', 'kgk', 'kgp', 'khs', 'khz', 'kik', 'kiw', 'kiz', 'kje', 'kjs', 'kkc', 'kkl', 'klt', 'klv', 'kmg', 'kmh', 'kmk', 'kmo', 'kms', 'kmu', 'kne', 'knf', 'knj', 'knv', 'kos', 'kpf', 'kpg', 'kpj', 'kpr', 'kpw', 'kpx', 'kqa', 'kqc', 'kqf', 'kql', 'kqw', 'ksd', 'ksj', 'ksr', 'ktm', 'kto', 'kud', 'kue', 'kup', 'kvg', 'kvn', 'kwd', 'kwf', 'kwi', 'kwj', 'kyc', 'kyf', 'kyg', 'kyq', 'kyz', 'kze', 'lac', 'lat', 'lbb', 'lbk', 'lcm', 'leu', 'lex', 'lgl', 'lid', 'lif', 'lin', 'lit', 'llg', 'lug', 'luo', 'lww', 'maa', 'maj', 'mal', 'mam', 'maq', 'mar', 'mau', 'mav', 'maz', 'mbb', 'mbc', 'mbh', 'mbj', 'mbl', 'mbs', 'mbt', 'mca', 'mcb', 'mcd', 'mcf', 'mco', 'mcp', 'mcq', 'mcr', 'mdy', 'med', 'mee', 'mek', 'meq', 'met', 'meu', 'mgc', 'mgh', 'mgw', 'mhl', 'mib', 'mic', 'mie', 'mig', 'mih', 'mil', 'mio', 'mir', 'mit', 'miz', 'mjc', 'mkj', 'mkl', 'mkn', 'mks', 'mle', 'mlh', 'mlp', 'mmo', 'mmx', 'mna', 'mop', 'mox', 'mph', 'mpj', 'mpm', 'mpp', 'mps', 'mpt', 'mpx', 'mqb', 'mqj', 'msb', 'msc', 'msk', 'msm', 'msy', 'mti', 'mto', 'mux', 'muy', 'mva', 'mvn', 'mwc', 'mwe', 'mwf', 'mwp', 'mxb', 'mxp', 'mxq', 'mxt', 'mya', 'myk', 'myu', 'myw', 'myy', 'mzz', 'nab', 'naf', 'nak', 'nas', 'nbq', 'nca', 'nch', 'ncj', 'ncl', 'ncu', 'ndg', 'ndj', 'nfa', 'ngp', 'ngu', 'nhe', 'nhg', 'nhi', 'nho', 'nhr', 'nhu', 'nhw', 'nhy', 'nif', 'nii', 'nin', 'nko', 'nld', 'nlg', 'nna', 'nnq', 'noa', 'nop', 'not', 'nou', 'npi', 'npl', 'nsn', 'nss', 'ntj', 'ntp', 'ntu', 'nuy', 'nvm', 'nwi', 
'nya', 'nys', 'nyu', 'obo', 'okv', 'omw', 'ong', 'ons', 'ood', 'opm', 'ory', 'ote', 'otm', 'otn', 'otq', 'ots', 'pab', 'pad', 'pah', 'pan', 'pao', 'pes', 'pib', 'pio', 'pir', 'piu', 'pjt', 'pls', 'plu', 'pma', 'poe', 'poh', 'poi', 'pol', 'pon', 'por', 'poy', 'ppo', 'prf', 'pri', 'ptp', 'ptu', 'pwg', 'qub', 'quc', 'quf', 'quh', 'qul', 'qup', 'qvc', 'qve', 'qvh', 'qvm', 'qvn', 'qvs', 'qvw', 'qvz', 'qwh', 'qxh', 'qxn', 'qxo', 'rai', 'reg', 'rgu', 'rkb', 'rmc', 'rmy', 'ron', 'roo', 'rop', 'row', 'rro', 'ruf', 'rug', 'rus', 'rwo', 'sab', 'san', 'sbe', 'sbk', 'sbs', 'seh', 'sey', 'sgb', 'sgz', 'shj', 'shp', 'sim', 'sja', 'sll', 'smk', 'snc', 'snn', 'snp', 'snx', 'sny', 'som', 'soq', 'soy', 'spa', 'spl', 'spm', 'spp', 'sps', 'spy', 'sri', 'srm', 'srn', 'srp', 'srq', 'ssd', 'ssg', 'ssx', 'stp', 'sua', 'sue', 'sus', 'suz', 'swe', 'swh', 'swp', 'sxb', 'tac', 'taj', 'tam', 'tav', 'taw', 'tbc', 'tbf', 'tbg', 'tbo', 'tbz', 'tca', 'tcs', 'tcz', 'tdt', 'tee', 'tel', 'ter', 'tet', 'tew', 'tfr', 'tgk', 'tgl', 'tgo', 'tgp', 'tha', 'tif', 'tim', 'tiw', 'tiy', 'tke', 'tku', 'tlf', 'tmd', 'tna', 'tnc', 'tnk', 'tnn', 'tnp', 'toc', 'tod', 'tof', 'toj', 'ton', 'too', 'top', 'tos', 'tpa', 'tpi', 'tpt', 'tpz', 'trc', 'tsw', 'ttc', 'tte', 'tuc', 'tue', 'tuf', 'tuo', 'tur', 'tvk', 'twi', 'txq', 'txu', 'tzj', 'tzo', 'ubr', 'ubu', 'udu', 'uig', 'ukr', 'uli', 'ulk', 'upv', 'ura', 'urb', 'urd', 'uri', 'urt', 'urw', 'usa', 'usp', 'uvh', 'uvl', 'vid', 'vie', 'viv', 'vmy', 'waj', 'wal', 'wap', 'wat', 'wbi', 'wbp', 'wed', 'wer', 'wim', 'wiu', 'wiv', 'wmt', 'wmw', 'wnc', 'wnu', 'wol', 'wos', 'wrk', 'wro', 'wrs', 'wsk', 'wuv', 'xav', 'xbi', 'xed', 'xla', 'xnn', 'xon', 'xsi', 'xtd', 'xtm', 'yaa', 'yad', 'yal', 'yap', 'yaq', 'yby', 'ycn', 'yka', 'yle', 'yml', 'yon', 'yor', 'yrb', 'yre', 'yss', 'yuj', 'yut', 'yuw', 'yva', 'zaa', 'zab', 'zac', 'zad', 'zai', 'zaj', 'zam', 'zao', 'zap', 'zar', 'zas', 'zat', 'zav', 'zaw', 'zca', 'zga', 'zia', 'ziw', 'zlm', 'zos', 'zpc', 'zpl', 'zpm', 'zpo', 'zpq', 'zpu', 
'zpv', 'zpz', 'zsr', 'ztq', 'zty', 'zyp'] | BitextMining | s2s | [Religious, Written] | None | None | -| [BigPatentClustering.v2](https://huggingface.co/datasets/NortheasternUniversity/big_patent) (Eva Sharma and Chen Li and Lu Wang, 2019) | ['eng'] | Clustering | p2p | [Legal, Written] | None | None | +| [BigPatentClustering.v2](https://huggingface.co/datasets/NortheasternUniversity/big_patent) (Eva Sharma and Chen Li and Lu Wang, 2019) | ['eng'] | Clustering | p2p | [Legal, Written] | None | None | | [BiorxivClusteringP2P.v2](https://api.biorxiv.org/) | ['eng'] | Clustering | p2p | [Academic, Written] | None | None | | [BiorxivClusteringS2S.v2](https://api.biorxiv.org/) | ['eng'] | Clustering | s2s | [Academic, Written] | None | None | | [Birdsnap](https://openaccess.thecvf.com/content_cvpr_2014/html/Berg_Birdsnap_Large-scale_Fine-grained_2014_CVPR_paper.html) (Berg et al., 2014) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | {'test': 1851} | {'test': {'num_samples': 1851, 'unique_num_labels': 490, 'min_image_width': 267, 'average_image_width': 2081.56, 'max_image_width': 6400, 'min_image_height': 200, 'average_image_height': 1609.19, 'max_image_height': 5400, 'labels': {'0': {'count': 4}, '1': {'count': 5}, '2': {'count': 4}, '3': {'count': 4}, '4': {'count': 4}, '5': {'count': 2}, '6': {'count': 3}, '7': {'count': 5}, '8': {'count': 4}, '9': {'count': 5}, '11': {'count': 3}, '12': {'count': 4}, '13': {'count': 5}, '14': {'count': 4}, '15': {'count': 5}, '16': {'count': 4}, '17': {'count': 3}, '18': {'count': 2}, '19': {'count': 5}, '20': {'count': 4}, '21': {'count': 4}, '22': {'count': 5}, '23': {'count': 2}, '24': {'count': 4}, '25': {'count': 3}, '26': {'count': 4}, '27': {'count': 4}, '28': {'count': 2}, '29': {'count': 5}, '30': {'count': 3}, '31': {'count': 3}, '32': {'count': 3}, '33': {'count': 4}, '34': {'count': 4}, '35': {'count': 4}, '36': {'count': 3}, '37': {'count': 3}, '38': {'count': 4}, '39': {'count': 3}, '40': {'count': 4}, '41': 
{'count': 3}, '42': {'count': 3}, '43': {'count': 4}, '44': {'count': 2}, '45': {'count': 3}, '47': {'count': 5}, '48': {'count': 2}, '49': {'count': 5}, '50': {'count': 4}, '51': {'count': 5}, '52': {'count': 3}, '53': {'count': 3}, '54': {'count': 4}, '55': {'count': 2}, '56': {'count': 2}, '57': {'count': 5}, '58': {'count': 2}, '59': {'count': 1}, '60': {'count': 1}, '61': {'count': 3}, '62': {'count': 3}, '63': {'count': 5}, '64': {'count': 5}, '65': {'count': 4}, '67': {'count': 2}, '68': {'count': 3}, '69': {'count': 4}, '70': {'count': 5}, '71': {'count': 5}, '72': {'count': 5}, '73': {'count': 4}, '74': {'count': 5}, '75': {'count': 4}, '76': {'count': 4}, '80': {'count': 3}, '81': {'count': 5}, '82': {'count': 3}, '83': {'count': 5}, '84': {'count': 3}, '85': {'count': 4}, '86': {'count': 4}, '87': {'count': 5}, '88': {'count': 4}, '89': {'count': 5}, '90': {'count': 4}, '91': {'count': 4}, '92': {'count': 5}, '93': {'count': 4}, '94': {'count': 4}, '95': {'count': 5}, '96': {'count': 5}, '97': {'count': 5}, '98': {'count': 3}, '99': {'count': 5}, '100': {'count': 4}, '101': {'count': 5}, '102': {'count': 4}, '103': {'count': 3}, '105': {'count': 4}, '108': {'count': 4}, '109': {'count': 5}, '110': {'count': 3}, '111': {'count': 3}, '112': {'count': 4}, '113': {'count': 4}, '114': {'count': 5}, '115': {'count': 4}, '116': {'count': 5}, '117': {'count': 4}, '118': {'count': 4}, '119': {'count': 5}, '120': {'count': 5}, '121': {'count': 4}, '122': {'count': 3}, '124': {'count': 3}, '125': {'count': 4}, '126': {'count': 2}, '127': {'count': 3}, '128': {'count': 5}, '129': {'count': 5}, '130': {'count': 5}, '131': {'count': 3}, '132': {'count': 4}, '133': {'count': 4}, '134': {'count': 2}, '135': {'count': 5}, '136': {'count': 5}, '137': {'count': 3}, '138': {'count': 4}, '139': {'count': 3}, '140': {'count': 3}, '141': {'count': 2}, '142': {'count': 3}, '143': {'count': 5}, '144': {'count': 4}, '145': {'count': 5}, '146': {'count': 5}, '147': {'count': 5}, 
'148': {'count': 4}, '149': {'count': 4}, '150': {'count': 5}, '151': {'count': 5}, '152': {'count': 5}, '153': {'count': 3}, '154': {'count': 4}, '155': {'count': 3}, '156': {'count': 3}, '157': {'count': 3}, '159': {'count': 3}, '160': {'count': 4}, '161': {'count': 4}, '162': {'count': 4}, '163': {'count': 4}, '164': {'count': 3}, '165': {'count': 3}, '166': {'count': 3}, '167': {'count': 4}, '168': {'count': 4}, '169': {'count': 4}, '170': {'count': 4}, '171': {'count': 5}, '172': {'count': 4}, '173': {'count': 4}, '174': {'count': 5}, '175': {'count': 4}, '176': {'count': 2}, '177': {'count': 5}, '178': {'count': 5}, '179': {'count': 5}, '180': {'count': 5}, '181': {'count': 4}, '183': {'count': 2}, '184': {'count': 3}, '185': {'count': 2}, '186': {'count': 5}, '187': {'count': 2}, '188': {'count': 3}, '189': {'count': 2}, '190': {'count': 5}, '191': {'count': 4}, '192': {'count': 3}, '193': {'count': 3}, '194': {'count': 4}, '195': {'count': 3}, '196': {'count': 4}, '197': {'count': 3}, '198': {'count': 4}, '199': {'count': 5}, '200': {'count': 5}, '201': {'count': 1}, '204': {'count': 4}, '205': {'count': 5}, '206': {'count': 4}, '207': {'count': 3}, '208': {'count': 4}, '209': {'count': 4}, '210': {'count': 4}, '211': {'count': 4}, '212': {'count': 5}, '213': {'count': 4}, '214': {'count': 5}, '215': {'count': 3}, '216': {'count': 1}, '217': {'count': 5}, '218': {'count': 2}, '219': {'count': 5}, '220': {'count': 4}, '221': {'count': 5}, '222': {'count': 5}, '223': {'count': 3}, '224': {'count': 4}, '225': {'count': 5}, '226': {'count': 3}, '227': {'count': 4}, '228': {'count': 3}, '229': {'count': 4}, '230': {'count': 4}, '231': {'count': 5}, '232': {'count': 5}, '233': {'count': 5}, '234': {'count': 4}, '235': {'count': 4}, '236': {'count': 5}, '237': {'count': 5}, '238': {'count': 5}, '239': {'count': 4}, '240': {'count': 3}, '241': {'count': 3}, '242': {'count': 4}, '243': {'count': 5}, '244': {'count': 2}, '245': {'count': 4}, '246': {'count': 5}, 
'247': {'count': 3}, '248': {'count': 3}, '249': {'count': 5}, '250': {'count': 5}, '251': {'count': 4}, '252': {'count': 2}, '253': {'count': 5}, '254': {'count': 5}, '255': {'count': 5}, '256': {'count': 4}, '257': {'count': 4}, '258': {'count': 4}, '259': {'count': 3}, '260': {'count': 5}, '261': {'count': 4}, '262': {'count': 4}, '264': {'count': 4}, '265': {'count': 3}, '266': {'count': 5}, '267': {'count': 5}, '268': {'count': 3}, '269': {'count': 2}, '270': {'count': 3}, '271': {'count': 4}, '272': {'count': 4}, '273': {'count': 5}, '274': {'count': 5}, '275': {'count': 5}, '276': {'count': 2}, '277': {'count': 3}, '278': {'count': 5}, '279': {'count': 5}, '280': {'count': 4}, '281': {'count': 5}, '282': {'count': 5}, '283': {'count': 3}, '284': {'count': 5}, '285': {'count': 3}, '286': {'count': 5}, '287': {'count': 5}, '288': {'count': 4}, '289': {'count': 4}, '290': {'count': 5}, '291': {'count': 3}, '292': {'count': 2}, '293': {'count': 1}, '294': {'count': 1}, '295': {'count': 2}, '296': {'count': 4}, '297': {'count': 5}, '298': {'count': 4}, '300': {'count': 3}, '301': {'count': 3}, '303': {'count': 4}, '304': {'count': 4}, '305': {'count': 4}, '306': {'count': 2}, '307': {'count': 5}, '308': {'count': 4}, '309': {'count': 2}, '310': {'count': 3}, '311': {'count': 3}, '312': {'count': 4}, '313': {'count': 3}, '314': {'count': 3}, '315': {'count': 3}, '316': {'count': 5}, '317': {'count': 4}, '318': {'count': 5}, '319': {'count': 4}, '320': {'count': 4}, '321': {'count': 3}, '322': {'count': 5}, '323': {'count': 4}, '324': {'count': 2}, '325': {'count': 1}, '326': {'count': 3}, '327': {'count': 4}, '328': {'count': 3}, '330': {'count': 4}, '331': {'count': 4}, '332': {'count': 2}, '333': {'count': 5}, '334': {'count': 5}, '335': {'count': 5}, '336': {'count': 4}, '337': {'count': 4}, '338': {'count': 5}, '339': {'count': 3}, '340': {'count': 5}, '341': {'count': 5}, '342': {'count': 5}, '343': {'count': 2}, '344': {'count': 2}, '345': {'count': 3}, 
'346': {'count': 3}, '347': {'count': 5}, '348': {'count': 3}, '349': {'count': 2}, '350': {'count': 4}, '352': {'count': 5}, '353': {'count': 3}, '354': {'count': 5}, '355': {'count': 5}, '356': {'count': 4}, '357': {'count': 3}, '358': {'count': 3}, '359': {'count': 4}, '360': {'count': 5}, '361': {'count': 5}, '362': {'count': 4}, '363': {'count': 3}, '364': {'count': 4}, '365': {'count': 1}, '366': {'count': 4}, '367': {'count': 3}, '368': {'count': 4}, '369': {'count': 3}, '370': {'count': 5}, '371': {'count': 3}, '372': {'count': 5}, '373': {'count': 4}, '374': {'count': 4}, '375': {'count': 3}, '376': {'count': 4}, '377': {'count': 4}, '378': {'count': 4}, '379': {'count': 4}, '380': {'count': 4}, '381': {'count': 4}, '382': {'count': 1}, '383': {'count': 4}, '384': {'count': 4}, '385': {'count': 4}, '386': {'count': 2}, '387': {'count': 4}, '388': {'count': 2}, '389': {'count': 5}, '390': {'count': 4}, '391': {'count': 5}, '392': {'count': 4}, '394': {'count': 4}, '395': {'count': 4}, '396': {'count': 4}, '397': {'count': 4}, '398': {'count': 5}, '399': {'count': 4}, '400': {'count': 5}, '401': {'count': 4}, '402': {'count': 4}, '404': {'count': 5}, '405': {'count': 5}, '406': {'count': 5}, '407': {'count': 4}, '408': {'count': 2}, '409': {'count': 4}, '410': {'count': 3}, '411': {'count': 5}, '412': {'count': 4}, '413': {'count': 3}, '414': {'count': 4}, '415': {'count': 4}, '416': {'count': 4}, '417': {'count': 5}, '418': {'count': 3}, '419': {'count': 5}, '421': {'count': 4}, '422': {'count': 3}, '423': {'count': 5}, '424': {'count': 5}, '425': {'count': 2}, '426': {'count': 5}, '427': {'count': 4}, '428': {'count': 5}, '429': {'count': 3}, '430': {'count': 2}, '431': {'count': 3}, '432': {'count': 5}, '433': {'count': 4}, '434': {'count': 3}, '435': {'count': 3}, '437': {'count': 3}, '438': {'count': 5}, '439': {'count': 2}, '440': {'count': 4}, '441': {'count': 4}, '442': {'count': 5}, '443': {'count': 2}, '444': {'count': 3}, '445': {'count': 3}, 
'446': {'count': 5}, '447': {'count': 3}, '448': {'count': 2}, '449': {'count': 1}, '450': {'count': 3}, '451': {'count': 3}, '452': {'count': 4}, '453': {'count': 2}, '454': {'count': 4}, '455': {'count': 4}, '456': {'count': 5}, '458': {'count': 4}, '459': {'count': 4}, '460': {'count': 5}, '461': {'count': 4}, '462': {'count': 4}, '463': {'count': 5}, '464': {'count': 5}, '466': {'count': 2}, '467': {'count': 4}, '468': {'count': 3}, '469': {'count': 5}, '470': {'count': 5}, '471': {'count': 2}, '472': {'count': 4}, '473': {'count': 3}, '474': {'count': 5}, '475': {'count': 5}, '476': {'count': 5}, '477': {'count': 4}, '478': {'count': 2}, '479': {'count': 4}, '480': {'count': 4}, '481': {'count': 5}, '482': {'count': 4}, '483': {'count': 3}, '484': {'count': 5}, '485': {'count': 5}, '486': {'count': 4}, '487': {'count': 3}, '488': {'count': 3}, '489': {'count': 1}, '490': {'count': 1}, '491': {'count': 2}, '492': {'count': 4}, '493': {'count': 4}, '494': {'count': 3}, '495': {'count': 4}, '496': {'count': 5}, '497': {'count': 5}, '498': {'count': 5}, '499': {'count': 4}, '79': {'count': 4}, '106': {'count': 4}, '107': {'count': 4}, '202': {'count': 1}, '203': {'count': 1}, '457': {'count': 3}, '77': {'count': 2}, '78': {'count': 4}, '182': {'count': 2}, '263': {'count': 4}, '104': {'count': 1}, '158': {'count': 5}, '329': {'count': 1}, '393': {'count': 2}, '420': {'count': 2}}}} | @@ -67,7 +68,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [BlurbsClusteringP2P.v2](https://www.inf.uni-hamburg.de/en/inst/ab/lt/resources/data/germeval-2019-hmc.html) (Steffen Remus, 2019) | ['deu'] | Clustering | p2p | [Fiction, Written] | None | None | | [BlurbsClusteringS2S.v2](https://www.inf.uni-hamburg.de/en/inst/ab/lt/resources/data/germeval-2019-hmc.html) (Steffen Remus, 2019) | ['deu'] | Clustering | s2s | [Fiction, Written] | None | None | | [BornholmBitextMining](https://aclanthology.org/W19-6138/) | ['dan'] | BitextMining | s2s | [Fiction, Social, Web, Written] | {'test': 500} | {'test': {'num_samples': 500, 'number_of_characters': 44361, 'unique_pairs': 500, 'min_sentence1_length': 1, 'average_sentence1_length': 49.83, 'max_sentence1_length': 555, 'unique_sentence1': 497, 'min_sentence2_length': 5, 'average_sentence2_length': 38.89, 'max_sentence2_length': 453, 'unique_sentence2': 491}} | -| [BrazilianToxicTweetsClassification](https://paperswithcode.com/dataset/told-br) (Joao Augusto Leite and Diego F. Silva and Kalina Bontcheva and Carolina Scarton, 2020) | ['por'] | MultilabelClassification | s2s | [Constructed, Written] | None | None | +| [BrazilianToxicTweetsClassification](https://paperswithcode.com/dataset/told-br) (Joao Augusto Leite and Diego F. Silva and Kalina Bontcheva and Carolina Scarton, 2020) | ['por'] | MultilabelClassification | s2s | [Constructed, Written] | None | None | | [BrightLongRetrieval](https://huggingface.co/datasets/xlangai/BRIGHT) (Hongjin Su, 2024) | ['eng'] | Retrieval | s2p | [Non-fiction, Written] | None | None | | [BrightRetrieval](https://huggingface.co/datasets/xlangai/BRIGHT) (Hongjin Su, 2024) | ['eng'] | Retrieval | s2p | [Non-fiction, Written] | None | None | | [BuiltBenchClusteringP2P](https://arxiv.org/abs/2411.12056) (Shahinmoghadam et al., 2024) | ['eng'] | Clustering | p2p | [Engineering, Written] | None | None | @@ -76,10 +77,10 @@ The following tables give you an overview of the tasks in MTEB. 
| [BuiltBenchRetrieval](https://arxiv.org/abs/2411.12056) (Shahinmoghadam et al., 2024) | ['eng'] | Retrieval | p2p | [Engineering, Written] | None | None | | [BulgarianStoreReviewSentimentClassfication](https://doi.org/10.7910/DVN/TXIK9P) (Georgieva-Trifonova et al., 2018) | ['bul'] | Classification | s2s | [Reviews, Written] | None | None | | [CBD](http://2019.poleval.pl/files/poleval2019.pdf) | ['pol'] | Classification | s2s | [Social, Written] | None | None | -| [CDSC-E](https://aclanthology.org/P17-1073.pdf) | ['pol'] | PairClassification | s2s | [Written] | None | None | -| [CDSC-R](https://aclanthology.org/P17-1073.pdf) | ['pol'] | STS | s2s | [Web, Written] | None | None | +| [CDSC-E](https://aclanthology.org/P17-1073.pdf) (Wróblewska et al., 2017) | ['pol'] | PairClassification | s2s | [Written] | None | None | +| [CDSC-R](https://aclanthology.org/P17-1073.pdf) (Wróblewska et al., 2017) | ['pol'] | STS | s2s | [Web, Written] | None | None | | [CEDRClassification](https://www.sciencedirect.com/science/article/pii/S1877050921013247) (Sboev et al., 2021) | ['rus'] | MultilabelClassification | s2s | [Blog, Social, Web, Written] | {'test': 1882, 'train': 7528} | {'test': {'num_samples': 1882, 'number_of_characters': 171649, 'number_texts_in_train': 7, 'min_text_length': 6, 'average_text_length': 91.21, 'max_text_length': 220, 'unique_texts': 1875, 'min_labels_per_text': 0, 'average_label_per_text': 0.62, 'max_labels_per_text': 2, 'unique_labels': 6, 'labels': {'None': {'count': 734}, '3': {'count': 141}, '2': {'count': 170}, '1': {'count': 379}, '0': {'count': 353}, '4': {'count': 125}}}, 'train': {'num_samples': 7528, 'number_of_characters': 697322, 'number_texts_in_train': None, 'min_text_length': 5, 'average_text_length': 92.63, 'max_text_length': 280, 'unique_texts': 7500, 'min_labels_per_text': 0, 'average_label_per_text': 0.61, 'max_labels_per_text': 3, 'unique_labels': 6, 'labels': {'None': {'count': 3043}, '2': {'count': 607}, '0': {'count': 1569}, '3': {'count': 589}, '1': 
{'count': 1417}, '4': {'count': 411}}}} | -| [CExaPPC](https://github.com/exaco/exappc) | ['fas'] | PairClassification | s2s | [Social, Web] | None | None | +| [CExaPPC](https://github.com/exaco/exappc) (Sadeghi et al., 2022) | ['fas'] | PairClassification | s2s | [Social, Web] | None | None | | [CIFAR10](https://huggingface.co/datasets/uoft-cs/cifar10) (Alex Krizhevsky, 2009) | ['eng'] | ImageClassification | i2i | [Web] | {'test': 10000} | {'test': {'num_samples': 10000, 'unique_num_labels': 10, 'min_image_width': 32, 'average_image_width': 32.0, 'max_image_width': 32, 'min_image_height': 32, 'average_image_height': 32.0, 'max_image_height': 32, 'labels': {'3': {'count': 1000}, '8': {'count': 1000}, '0': {'count': 1000}, '6': {'count': 1000}, '1': {'count': 1000}, '9': {'count': 1000}, '5': {'count': 1000}, '7': {'count': 1000}, '4': {'count': 1000}, '2': {'count': 1000}}}} | | [CIFAR100](https://huggingface.co/datasets/uoft-cs/cifar100) (Alex Krizhevsky, 2009) | ['eng'] | ImageClassification | i2t | [Web] | {'test': 10000} | {'test': {'num_samples': 10000, 'unique_num_labels': 100, 'min_image_width': 32, 'average_image_width': 32.0, 'max_image_width': 32, 'min_image_height': 32, 'average_image_height': 32.0, 'max_image_height': 32, 'labels': {'49': {'count': 100}, '33': {'count': 100}, '72': {'count': 100}, '51': {'count': 100}, '71': {'count': 100}, '92': {'count': 100}, '15': {'count': 100}, '14': {'count': 100}, '23': {'count': 100}, '0': {'count': 100}, '75': {'count': 100}, '81': {'count': 100}, '69': {'count': 100}, '40': {'count': 100}, '43': {'count': 100}, '97': {'count': 100}, '70': {'count': 100}, '53': {'count': 100}, '29': {'count': 100}, '21': {'count': 100}, '16': {'count': 100}, '39': {'count': 100}, '8': {'count': 100}, '20': {'count': 100}, '61': {'count': 100}, '41': {'count': 100}, '93': {'count': 100}, '56': {'count': 100}, '73': {'count': 100}, '58': {'count': 100}, '11': {'count': 100}, '25': {'count': 100}, '37': {'count': 100}, '63': 
{'count': 100}, '24': {'count': 100}, '22': {'count': 100}, '17': {'count': 100}, '4': {'count': 100}, '6': {'count': 100}, '9': {'count': 100}, '57': {'count': 100}, '2': {'count': 100}, '32': {'count': 100}, '52': {'count': 100}, '42': {'count': 100}, '77': {'count': 100}, '27': {'count': 100}, '65': {'count': 100}, '7': {'count': 100}, '35': {'count': 100}, '82': {'count': 100}, '66': {'count': 100}, '90': {'count': 100}, '67': {'count': 100}, '91': {'count': 100}, '10': {'count': 100}, '78': {'count': 100}, '54': {'count': 100}, '89': {'count': 100}, '18': {'count': 100}, '13': {'count': 100}, '50': {'count': 100}, '26': {'count': 100}, '83': {'count': 100}, '47': {'count': 100}, '95': {'count': 100}, '76': {'count': 100}, '59': {'count': 100}, '85': {'count': 100}, '19': {'count': 100}, '46': {'count': 100}, '1': {'count': 100}, '74': {'count': 100}, '60': {'count': 100}, '64': {'count': 100}, '45': {'count': 100}, '36': {'count': 100}, '87': {'count': 100}, '30': {'count': 100}, '99': {'count': 100}, '80': {'count': 100}, '28': {'count': 100}, '98': {'count': 100}, '12': {'count': 100}, '94': {'count': 100}, '68': {'count': 100}, '44': {'count': 100}, '31': {'count': 100}, '79': {'count': 100}, '34': {'count': 100}, '55': {'count': 100}, '62': {'count': 100}, '96': {'count': 100}, '84': {'count': 100}, '38': {'count': 100}, '86': {'count': 100}, '5': {'count': 100}, '48': {'count': 100}, '3': {'count': 100}, '88': {'count': 100}}}} | | [CIFAR100Clustering](https://huggingface.co/datasets/uoft-cs/cifar100) (Alex Krizhevsky, 2009) | ['eng'] | ImageClustering | i2t | [Web] | {'test': 10000} | {'test': {'num_samples': 10000, 'unique_num_labels': 100, 'min_image_width': 32, 'average_image_width': 32.0, 'max_image_width': 32, 'min_image_height': 32, 'average_image_height': 32.0, 'max_image_height': 32, 'labels': {'49': {'count': 100}, '33': {'count': 100}, '72': {'count': 100}, '51': {'count': 100}, '71': {'count': 100}, '92': {'count': 100}, '15': {'count': 100}, 
'14': {'count': 100}, '23': {'count': 100}, '0': {'count': 100}, '75': {'count': 100}, '81': {'count': 100}, '69': {'count': 100}, '40': {'count': 100}, '43': {'count': 100}, '97': {'count': 100}, '70': {'count': 100}, '53': {'count': 100}, '29': {'count': 100}, '21': {'count': 100}, '16': {'count': 100}, '39': {'count': 100}, '8': {'count': 100}, '20': {'count': 100}, '61': {'count': 100}, '41': {'count': 100}, '93': {'count': 100}, '56': {'count': 100}, '73': {'count': 100}, '58': {'count': 100}, '11': {'count': 100}, '25': {'count': 100}, '37': {'count': 100}, '63': {'count': 100}, '24': {'count': 100}, '22': {'count': 100}, '17': {'count': 100}, '4': {'count': 100}, '6': {'count': 100}, '9': {'count': 100}, '57': {'count': 100}, '2': {'count': 100}, '32': {'count': 100}, '52': {'count': 100}, '42': {'count': 100}, '77': {'count': 100}, '27': {'count': 100}, '65': {'count': 100}, '7': {'count': 100}, '35': {'count': 100}, '82': {'count': 100}, '66': {'count': 100}, '90': {'count': 100}, '67': {'count': 100}, '91': {'count': 100}, '10': {'count': 100}, '78': {'count': 100}, '54': {'count': 100}, '89': {'count': 100}, '18': {'count': 100}, '13': {'count': 100}, '50': {'count': 100}, '26': {'count': 100}, '83': {'count': 100}, '47': {'count': 100}, '95': {'count': 100}, '76': {'count': 100}, '59': {'count': 100}, '85': {'count': 100}, '19': {'count': 100}, '46': {'count': 100}, '1': {'count': 100}, '74': {'count': 100}, '60': {'count': 100}, '64': {'count': 100}, '45': {'count': 100}, '36': {'count': 100}, '87': {'count': 100}, '30': {'count': 100}, '99': {'count': 100}, '80': {'count': 100}, '28': {'count': 100}, '98': {'count': 100}, '12': {'count': 100}, '94': {'count': 100}, '68': {'count': 100}, '44': {'count': 100}, '31': {'count': 100}, '79': {'count': 100}, '34': {'count': 100}, '55': {'count': 100}, '62': {'count': 100}, '96': {'count': 100}, '84': {'count': 100}, '38': {'count': 100}, '86': {'count': 100}, '5': {'count': 100}, '48': {'count': 100}, '3': 
{'count': 100}, '88': {'count': 100}}}} | @@ -134,6 +135,7 @@ The following tables give you an overview of the tasks in MTEB. | [CQADupstackWordpress-NL](https://huggingface.co/datasets/clips/beir-nl-cqadupstack) (Nikolay Banar, 2024) | ['nld'] | Retrieval | s2p | [Non-fiction, Written] | None | None | | [CQADupstackWordpressRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Web, Written] | None | None | | [CQADupstackWordpressRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-wordpress-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | +| [CREMA_D](https://huggingface.co/datasets/silky1708/CREMA-D) (Cao et al., 2014) | ['eng'] | AudioClassification | a2t | [Speech] | None | None | | [CSFDCZMovieReviewSentimentClassification](https://arxiv.org/abs/2304.01922) (Michal Štefánik, 2023) | ['ces'] | Classification | s2s | [Reviews, Written] | None | None | | [CSFDSKMovieReviewSentimentClassification](https://arxiv.org/abs/2304.01922) (Michal Štefánik, 2023) | ['slk'] | Classification | s2s | [Reviews, Written] | None | None | | [CTKFactsNLI](https://arxiv.org/abs/2201.11115) (Ullrich et al., 2023) | ['ces'] | PairClassification | s2s | [News, Written] | None | None | @@ -184,17 +186,17 @@ The following tables give you an overview of the tasks in MTEB. 
| [Caltech101](https://ieeexplore.ieee.org/document/1384978) (Li Fei-Fei, 2004) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | {'test': 6084} | {'test': {'num_samples': 6084, 'unique_num_labels': 102, 'min_image_width': 80, 'average_image_width': 311.72, 'max_image_width': 3481, 'min_image_height': 101, 'average_image_height': 241.84, 'max_image_height': 3999, 'labels': {'4': {'count': 437}, '37': {'count': 405}, '38': {'count': 405}, '57': {'count': 170}, '66': {'count': 768}, '0': {'count': 25}, '1': {'count': 770}, '2': {'count': 12}, '3': {'count': 12}, '5': {'count': 17}, '6': {'count': 24}, '7': {'count': 16}, '8': {'count': 3}, '9': {'count': 98}, '10': {'count': 68}, '11': {'count': 13}, '12': {'count': 55}, '13': {'count': 61}, '14': {'count': 20}, '15': {'count': 13}, '16': {'count': 93}, '17': {'count': 17}, '18': {'count': 29}, '19': {'count': 32}, '20': {'count': 77}, '22': {'count': 39}, '23': {'count': 43}, '24': {'count': 40}, '25': {'count': 20}, '26': {'count': 21}, '27': {'count': 27}, '28': {'count': 37}, '29': {'count': 22}, '30': {'count': 35}, '31': {'count': 38}, '32': {'count': 45}, '33': {'count': 34}, '34': {'count': 23}, '35': {'count': 34}, '36': {'count': 55}, '39': {'count': 37}, '40': {'count': 37}, '41': {'count': 15}, '42': {'count': 4}, '43': {'count': 4}, '44': {'count': 21}, '45': {'count': 69}, '46': {'count': 70}, '47': {'count': 12}, '48': {'count': 24}, '49': {'count': 58}, '50': {'count': 50}, '51': {'count': 1}, '52': {'count': 34}, '53': {'count': 56}, '54': {'count': 84}, '55': {'count': 31}, '56': {'count': 51}, '58': {'count': 48}, '59': {'count': 11}, '60': {'count': 36}, '61': {'count': 13}, '62': {'count': 10}, '63': {'count': 57}, '64': {'count': 2}, '65': {'count': 46}, '67': {'count': 25}, '68': {'count': 5}, '69': {'count': 9}, '70': {'count': 17}, '71': {'count': 8}, '72': {'count': 15}, '73': {'count': 23}, '74': {'count': 4}, '75': {'count': 27}, '76': {'count': 52}, '77': {'count': 29}, '78': 
{'count': 19}, '79': {'count': 10}, '80': {'count': 33}, '81': {'count': 9}, '82': {'count': 54}, '83': {'count': 27}, '84': {'count': 5}, '85': {'count': 34}, '86': {'count': 15}, '87': {'count': 56}, '88': {'count': 29}, '89': {'count': 34}, '90': {'count': 5}, '91': {'count': 55}, '92': {'count': 19}, '93': {'count': 56}, '94': {'count': 45}, '95': {'count': 209}, '96': {'count': 7}, '97': {'count': 29}, '98': {'count': 4}, '99': {'count': 26}, '100': {'count': 9}, '101': {'count': 30}, '21': {'count': 17}}}} | | [Caltech101ZeroShot](https://ieeexplore.ieee.org/document/1384978) (Li Fei-Fei, 2004) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | {'test': 1986} | {'test': {'num_samples': 1986, 'unique_num_labels': 63, 'min_image_width': 105, 'average_image_width': 277.19, 'max_image_width': 300, 'min_image_height': 114, 'average_image_height': 255.33, 'max_image_height': 300, 'min_label_text_length': 17, 'average_label_text_length': 21.88, 'max_label_text_length': 31, 'labels': {'36': {'count': 55}, '39': {'count': 37}, '40': {'count': 37}, '41': {'count': 15}, '42': {'count': 4}, '43': {'count': 4}, '44': {'count': 21}, '45': {'count': 69}, '46': {'count': 70}, '47': {'count': 12}, '48': {'count': 24}, '49': {'count': 58}, '50': {'count': 50}, '51': {'count': 1}, '52': {'count': 34}, '53': {'count': 56}, '54': {'count': 84}, '55': {'count': 31}, '56': {'count': 51}, '58': {'count': 48}, '59': {'count': 11}, '60': {'count': 36}, '61': {'count': 13}, '62': {'count': 10}, '63': {'count': 57}, '64': {'count': 2}, '65': {'count': 46}, '67': {'count': 25}, '68': {'count': 5}, '69': {'count': 9}, '70': {'count': 17}, '71': {'count': 8}, '72': {'count': 15}, '73': {'count': 23}, '74': {'count': 4}, '75': {'count': 27}, '76': {'count': 52}, '77': {'count': 29}, '78': {'count': 19}, '79': {'count': 10}, '80': {'count': 33}, '81': {'count': 9}, '82': {'count': 54}, '83': {'count': 27}, '84': {'count': 5}, '85': {'count': 34}, '86': {'count': 15}, '87': 
{'count': 56}, '88': {'count': 29}, '89': {'count': 34}, '90': {'count': 5}, '91': {'count': 55}, '92': {'count': 19}, '93': {'count': 56}, '94': {'count': 45}, '95': {'count': 209}, '96': {'count': 7}, '97': {'count': 29}, '98': {'count': 4}, '99': {'count': 26}, '100': {'count': 9}, '101': {'count': 30}, '21': {'count': 17}}}} | | [CanadaTaxCourtOutcomesLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | -| [CataloniaTweetClassification](https://aclanthology.org/2020.lrec-1.171/) | ['cat', 'spa'] | Classification | s2s | [Government, Social, Written] | None | None | +| [CataloniaTweetClassification](https://aclanthology.org/2020.lrec-1.171/) (Zotova et al., 2020) | ['cat', 'spa'] | Classification | s2s | [Government, Social, Written] | None | None | | [ChemHotpotQARetrieval](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Retrieval | s2p | [Chemistry] | None | None | -| [ChemNQRetrieval](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Retrieval | s2p | [Chemistry] | None | None | +| [ChemNQRetrieval](https://arxiv.org/abs/2412.00532) (Tom Kwiatkowski, 2019) | ['eng'] | Retrieval | s2p | [Chemistry] | None | None | | [ClimateFEVER](https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html) (Thomas Diggelmann, 2021) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [ClimateFEVER-Fa](https://huggingface.co/datasets/MCINext/climate-fever-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [ClimateFEVER-NL](https://huggingface.co/datasets/clips/beir-nl-climate-fever) (Nikolay Banar, 2024) | ['nld'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [ClimateFEVER.v2](https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html) (Thomas Diggelmann, 2021) | ['eng'] | Retrieval | s2p | [Academic, Written] | None | None | | 
[ClimateFEVERHardNegatives](https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html) (Thomas Diggelmann, 2021) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | -| [ClusTREC-Covid](https://github.com/katzurik/Knowledge_Navigator/tree/main/Benchmarks/CLUSTREC%20COVID) | ['eng'] | Clustering | p2p | [Academic, Medical, Written] | {'test': 4568} | {'test': {'num_samples': 4568, 'number_of_characters': 2977845, 'min_text_length': 14, 'average_text_length': 651.89, 'max_text_length': 8364, 'min_labels_per_text': 6, 'average_labels_per_text': 1.0, 'max_labels_per_text': 100, 'unique_labels': 50, 'labels': {'coronavirus origin': {'count': 100}, 'coronavirus response to weather changes': {'count': 100}, 'coronavirus immunity': {'count': 78}, 'how do people die from the coronavirus': {'count': 100}, 'animal models of COVID-19': {'count': 100}, 'coronavirus test rapid testing': {'count': 100}, 'serological tests for coronavirus': {'count': 100}, 'coronavirus under reporting': {'count': 100}, 'coronavirus in Canada': {'count': 92}, 'coronavirus social distancing impact': {'count': 100}, 'coronavirus hospital rationing': {'count': 100}, 'coronavirus quarantine': {'count': 100}, 'how does coronavirus spread': {'count': 100}, 'coronavirus super spreaders': {'count': 98}, 'coronavirus outside body': {'count': 34}, 'how long does coronavirus survive on surfaces': {'count': 74}, 'coronavirus clinical trials': {'count': 100}, 'masks prevent coronavirus': {'count': 100}, 'what alcohol sanitizer kills coronavirus': {'count': 64}, 'coronavirus and ACE inhibitors': {'count': 100}, 'coronavirus mortality': {'count': 100}, 'coronavirus heart impacts': {'count': 100}, 'coronavirus hypertension': {'count': 74}, 'coronavirus diabetes': {'count': 100}, 'coronavirus biomarkers': {'count': 100}, 'coronavirus early symptoms': {'count': 100}, 'coronavirus asymptomatic': {'count': 100}, 'coronavirus hydroxychloroquine': {'count': 100}, 'coronavirus drug 
repurposing': {'count': 100}, 'coronavirus remdesivir': {'count': 100}, 'difference between coronavirus and flu': {'count': 100}, 'coronavirus subtypes': {'count': 6}, 'coronavirus vaccine candidates': {'count': 36}, 'coronavirus recovery': {'count': 100}, 'coronavirus public datasets': {'count': 100}, 'SARS-CoV-2 spike structure': {'count': 100}, 'SARS-CoV-2 phylogenetic analysis': {'count': 100}, 'COVID inflammatory response': {'count': 100}, 'COVID-19 cytokine storm': {'count': 100}, 'coronavirus mutations': {'count': 100}, 'COVID-19 in African-Americans': {'count': 100}, 'Vitamin D and COVID-19': {'count': 100}, 'violence during pandemic': {'count': 100}, 'impact of masks on coronavirus transmission': {'count': 100}, 'coronavirus mental health impact': {'count': 100}, 'dexamethasone coronavirus': {'count': 92}, 'COVID-19 outcomes in children': {'count': 100}, 'school reopening coronavirus': {'count': 100}, 'post-infection COVID-19 immunity': {'count': 88}, 'mRNA vaccine coronavirus': {'count': 32}}, 'hf_subset_descriptive_stats': {'title and abstract': {'num_samples': 2284, 'number_of_characters': 2755462, 'min_text_length': 14, 'average_text_length': 1206.42, 'max_text_length': 8364, 'min_labels_per_text': 3, 'average_labels_per_text': 1.0, 'max_labels_per_text': 50, 'unique_labels': 50, 'labels': {'coronavirus origin': {'count': 50}, 'coronavirus response to weather changes': {'count': 50}, 'coronavirus immunity': {'count': 39}, 'how do people die from the coronavirus': {'count': 50}, 'animal models of COVID-19': {'count': 50}, 'coronavirus test rapid testing': {'count': 50}, 'serological tests for coronavirus': {'count': 50}, 'coronavirus under reporting': {'count': 50}, 'coronavirus in Canada': {'count': 46}, 'coronavirus social distancing impact': {'count': 50}, 'coronavirus hospital rationing': {'count': 50}, 'coronavirus quarantine': {'count': 50}, 'how does coronavirus spread': {'count': 50}, 'coronavirus super spreaders': {'count': 49}, 'coronavirus 
outside body': {'count': 17}, 'how long does coronavirus survive on surfaces': {'count': 37}, 'coronavirus clinical trials': {'count': 50}, 'masks prevent coronavirus': {'count': 50}, 'what alcohol sanitizer kills coronavirus': {'count': 32}, 'coronavirus and ACE inhibitors': {'count': 50}, 'coronavirus mortality': {'count': 50}, 'coronavirus heart impacts': {'count': 50}, 'coronavirus hypertension': {'count': 37}, 'coronavirus diabetes': {'count': 50}, 'coronavirus biomarkers': {'count': 50}, 'coronavirus early symptoms': {'count': 50}, 'coronavirus asymptomatic': {'count': 50}, 'coronavirus hydroxychloroquine': {'count': 50}, 'coronavirus drug repurposing': {'count': 50}, 'coronavirus remdesivir': {'count': 50}, 'difference between coronavirus and flu': {'count': 50}, 'coronavirus subtypes': {'count': 3}, 'coronavirus vaccine candidates': {'count': 18}, 'coronavirus recovery': {'count': 50}, 'coronavirus public datasets': {'count': 50}, 'SARS-CoV-2 spike structure': {'count': 50}, 'SARS-CoV-2 phylogenetic analysis': {'count': 50}, 'COVID inflammatory response': {'count': 50}, 'COVID-19 cytokine storm': {'count': 50}, 'coronavirus mutations': {'count': 50}, 'COVID-19 in African-Americans': {'count': 50}, 'Vitamin D and COVID-19': {'count': 50}, 'violence during pandemic': {'count': 50}, 'impact of masks on coronavirus transmission': {'count': 50}, 'coronavirus mental health impact': {'count': 50}, 'dexamethasone coronavirus': {'count': 46}, 'COVID-19 outcomes in children': {'count': 50}, 'school reopening coronavirus': {'count': 50}, 'post-infection COVID-19 immunity': {'count': 44}, 'mRNA vaccine coronavirus': {'count': 16}}}, 'title': {'num_samples': 2284, 'number_of_characters': 222383, 'min_text_length': 14, 'average_text_length': 97.37, 'max_text_length': 348, 'min_labels_per_text': 3, 'average_labels_per_text': 1.0, 'max_labels_per_text': 50, 'unique_labels': 50, 'labels': {'coronavirus origin': {'count': 50}, 'coronavirus response to weather changes': 
{'count': 50}, 'coronavirus immunity': {'count': 39}, 'how do people die from the coronavirus': {'count': 50}, 'animal models of COVID-19': {'count': 50}, 'coronavirus test rapid testing': {'count': 50}, 'serological tests for coronavirus': {'count': 50}, 'coronavirus under reporting': {'count': 50}, 'coronavirus in Canada': {'count': 46}, 'coronavirus social distancing impact': {'count': 50}, 'coronavirus hospital rationing': {'count': 50}, 'coronavirus quarantine': {'count': 50}, 'how does coronavirus spread': {'count': 50}, 'coronavirus super spreaders': {'count': 49}, 'coronavirus outside body': {'count': 17}, 'how long does coronavirus survive on surfaces': {'count': 37}, 'coronavirus clinical trials': {'count': 50}, 'masks prevent coronavirus': {'count': 50}, 'what alcohol sanitizer kills coronavirus': {'count': 32}, 'coronavirus and ACE inhibitors': {'count': 50}, 'coronavirus mortality': {'count': 50}, 'coronavirus heart impacts': {'count': 50}, 'coronavirus hypertension': {'count': 37}, 'coronavirus diabetes': {'count': 50}, 'coronavirus biomarkers': {'count': 50}, 'coronavirus early symptoms': {'count': 50}, 'coronavirus asymptomatic': {'count': 50}, 'coronavirus hydroxychloroquine': {'count': 50}, 'coronavirus drug repurposing': {'count': 50}, 'coronavirus remdesivir': {'count': 50}, 'difference between coronavirus and flu': {'count': 50}, 'coronavirus subtypes': {'count': 3}, 'coronavirus vaccine candidates': {'count': 18}, 'coronavirus recovery': {'count': 50}, 'coronavirus public datasets': {'count': 50}, 'SARS-CoV-2 spike structure': {'count': 50}, 'SARS-CoV-2 phylogenetic analysis': {'count': 50}, 'COVID inflammatory response': {'count': 50}, 'COVID-19 cytokine storm': {'count': 50}, 'coronavirus mutations': {'count': 50}, 'COVID-19 in African-Americans': {'count': 50}, 'Vitamin D and COVID-19': {'count': 50}, 'violence during pandemic': {'count': 50}, 'impact of masks on coronavirus transmission': {'count': 50}, 'coronavirus mental health impact': 
{'count': 50}, 'dexamethasone coronavirus': {'count': 46}, 'COVID-19 outcomes in children': {'count': 50}, 'school reopening coronavirus': {'count': 50}, 'post-infection COVID-19 immunity': {'count': 44}, 'mRNA vaccine coronavirus': {'count': 16}}}}}} | -| [CmedqaRetrieval](https://aclanthology.org/2022.emnlp-main.357.pdf) | ['cmn'] | Retrieval | s2p | [Medical, Written] | None | None | -| [Cmnli](https://huggingface.co/datasets/clue/viewer/cmnli) | ['cmn'] | PairClassification | s2s | | None | None | +| [ClusTREC-Covid](https://github.com/katzurik/Knowledge_Navigator/tree/main/Benchmarks/CLUSTREC%20COVID) (Katz et al., 2024) | ['eng'] | Clustering | p2p | [Academic, Medical, Written] | {'test': 4568} | {'test': {'num_samples': 4568, 'number_of_characters': 2977845, 'min_text_length': 14, 'average_text_length': 651.89, 'max_text_length': 8364, 'min_labels_per_text': 6, 'average_labels_per_text': 1.0, 'max_labels_per_text': 100, 'unique_labels': 50, 'labels': {'coronavirus origin': {'count': 100}, 'coronavirus response to weather changes': {'count': 100}, 'coronavirus immunity': {'count': 78}, 'how do people die from the coronavirus': {'count': 100}, 'animal models of COVID-19': {'count': 100}, 'coronavirus test rapid testing': {'count': 100}, 'serological tests for coronavirus': {'count': 100}, 'coronavirus under reporting': {'count': 100}, 'coronavirus in Canada': {'count': 92}, 'coronavirus social distancing impact': {'count': 100}, 'coronavirus hospital rationing': {'count': 100}, 'coronavirus quarantine': {'count': 100}, 'how does coronavirus spread': {'count': 100}, 'coronavirus super spreaders': {'count': 98}, 'coronavirus outside body': {'count': 34}, 'how long does coronavirus survive on surfaces': {'count': 74}, 'coronavirus clinical trials': {'count': 100}, 'masks prevent coronavirus': {'count': 100}, 'what alcohol sanitizer kills coronavirus': {'count': 64}, 'coronavirus and ACE inhibitors': {'count': 100}, 'coronavirus mortality': {'count': 100}, 
'coronavirus heart impacts': {'count': 100}, 'coronavirus hypertension': {'count': 74}, 'coronavirus diabetes': {'count': 100}, 'coronavirus biomarkers': {'count': 100}, 'coronavirus early symptoms': {'count': 100}, 'coronavirus asymptomatic': {'count': 100}, 'coronavirus hydroxychloroquine': {'count': 100}, 'coronavirus drug repurposing': {'count': 100}, 'coronavirus remdesivir': {'count': 100}, 'difference between coronavirus and flu': {'count': 100}, 'coronavirus subtypes': {'count': 6}, 'coronavirus vaccine candidates': {'count': 36}, 'coronavirus recovery': {'count': 100}, 'coronavirus public datasets': {'count': 100}, 'SARS-CoV-2 spike structure': {'count': 100}, 'SARS-CoV-2 phylogenetic analysis': {'count': 100}, 'COVID inflammatory response': {'count': 100}, 'COVID-19 cytokine storm': {'count': 100}, 'coronavirus mutations': {'count': 100}, 'COVID-19 in African-Americans': {'count': 100}, 'Vitamin D and COVID-19': {'count': 100}, 'violence during pandemic': {'count': 100}, 'impact of masks on coronavirus transmission': {'count': 100}, 'coronavirus mental health impact': {'count': 100}, 'dexamethasone coronavirus': {'count': 92}, 'COVID-19 outcomes in children': {'count': 100}, 'school reopening coronavirus': {'count': 100}, 'post-infection COVID-19 immunity': {'count': 88}, 'mRNA vaccine coronavirus': {'count': 32}}, 'hf_subset_descriptive_stats': {'title and abstract': {'num_samples': 2284, 'number_of_characters': 2755462, 'min_text_length': 14, 'average_text_length': 1206.42, 'max_text_length': 8364, 'min_labels_per_text': 3, 'average_labels_per_text': 1.0, 'max_labels_per_text': 50, 'unique_labels': 50, 'labels': {'coronavirus origin': {'count': 50}, 'coronavirus response to weather changes': {'count': 50}, 'coronavirus immunity': {'count': 39}, 'how do people die from the coronavirus': {'count': 50}, 'animal models of COVID-19': {'count': 50}, 'coronavirus test rapid testing': {'count': 50}, 'serological tests for coronavirus': {'count': 50}, 
'coronavirus under reporting': {'count': 50}, 'coronavirus in Canada': {'count': 46}, 'coronavirus social distancing impact': {'count': 50}, 'coronavirus hospital rationing': {'count': 50}, 'coronavirus quarantine': {'count': 50}, 'how does coronavirus spread': {'count': 50}, 'coronavirus super spreaders': {'count': 49}, 'coronavirus outside body': {'count': 17}, 'how long does coronavirus survive on surfaces': {'count': 37}, 'coronavirus clinical trials': {'count': 50}, 'masks prevent coronavirus': {'count': 50}, 'what alcohol sanitizer kills coronavirus': {'count': 32}, 'coronavirus and ACE inhibitors': {'count': 50}, 'coronavirus mortality': {'count': 50}, 'coronavirus heart impacts': {'count': 50}, 'coronavirus hypertension': {'count': 37}, 'coronavirus diabetes': {'count': 50}, 'coronavirus biomarkers': {'count': 50}, 'coronavirus early symptoms': {'count': 50}, 'coronavirus asymptomatic': {'count': 50}, 'coronavirus hydroxychloroquine': {'count': 50}, 'coronavirus drug repurposing': {'count': 50}, 'coronavirus remdesivir': {'count': 50}, 'difference between coronavirus and flu': {'count': 50}, 'coronavirus subtypes': {'count': 3}, 'coronavirus vaccine candidates': {'count': 18}, 'coronavirus recovery': {'count': 50}, 'coronavirus public datasets': {'count': 50}, 'SARS-CoV-2 spike structure': {'count': 50}, 'SARS-CoV-2 phylogenetic analysis': {'count': 50}, 'COVID inflammatory response': {'count': 50}, 'COVID-19 cytokine storm': {'count': 50}, 'coronavirus mutations': {'count': 50}, 'COVID-19 in African-Americans': {'count': 50}, 'Vitamin D and COVID-19': {'count': 50}, 'violence during pandemic': {'count': 50}, 'impact of masks on coronavirus transmission': {'count': 50}, 'coronavirus mental health impact': {'count': 50}, 'dexamethasone coronavirus': {'count': 46}, 'COVID-19 outcomes in children': {'count': 50}, 'school reopening coronavirus': {'count': 50}, 'post-infection COVID-19 immunity': {'count': 44}, 'mRNA vaccine coronavirus': {'count': 16}}}, 
'title': {'num_samples': 2284, 'number_of_characters': 222383, 'min_text_length': 14, 'average_text_length': 97.37, 'max_text_length': 348, 'min_labels_per_text': 3, 'average_labels_per_text': 1.0, 'max_labels_per_text': 50, 'unique_labels': 50, 'labels': {'coronavirus origin': {'count': 50}, 'coronavirus response to weather changes': {'count': 50}, 'coronavirus immunity': {'count': 39}, 'how do people die from the coronavirus': {'count': 50}, 'animal models of COVID-19': {'count': 50}, 'coronavirus test rapid testing': {'count': 50}, 'serological tests for coronavirus': {'count': 50}, 'coronavirus under reporting': {'count': 50}, 'coronavirus in Canada': {'count': 46}, 'coronavirus social distancing impact': {'count': 50}, 'coronavirus hospital rationing': {'count': 50}, 'coronavirus quarantine': {'count': 50}, 'how does coronavirus spread': {'count': 50}, 'coronavirus super spreaders': {'count': 49}, 'coronavirus outside body': {'count': 17}, 'how long does coronavirus survive on surfaces': {'count': 37}, 'coronavirus clinical trials': {'count': 50}, 'masks prevent coronavirus': {'count': 50}, 'what alcohol sanitizer kills coronavirus': {'count': 32}, 'coronavirus and ACE inhibitors': {'count': 50}, 'coronavirus mortality': {'count': 50}, 'coronavirus heart impacts': {'count': 50}, 'coronavirus hypertension': {'count': 37}, 'coronavirus diabetes': {'count': 50}, 'coronavirus biomarkers': {'count': 50}, 'coronavirus early symptoms': {'count': 50}, 'coronavirus asymptomatic': {'count': 50}, 'coronavirus hydroxychloroquine': {'count': 50}, 'coronavirus drug repurposing': {'count': 50}, 'coronavirus remdesivir': {'count': 50}, 'difference between coronavirus and flu': {'count': 50}, 'coronavirus subtypes': {'count': 3}, 'coronavirus vaccine candidates': {'count': 18}, 'coronavirus recovery': {'count': 50}, 'coronavirus public datasets': {'count': 50}, 'SARS-CoV-2 spike structure': {'count': 50}, 'SARS-CoV-2 phylogenetic analysis': {'count': 50}, 'COVID inflammatory 
response': {'count': 50}, 'COVID-19 cytokine storm': {'count': 50}, 'coronavirus mutations': {'count': 50}, 'COVID-19 in African-Americans': {'count': 50}, 'Vitamin D and COVID-19': {'count': 50}, 'violence during pandemic': {'count': 50}, 'impact of masks on coronavirus transmission': {'count': 50}, 'coronavirus mental health impact': {'count': 50}, 'dexamethasone coronavirus': {'count': 46}, 'COVID-19 outcomes in children': {'count': 50}, 'school reopening coronavirus': {'count': 50}, 'post-infection COVID-19 immunity': {'count': 44}, 'mRNA vaccine coronavirus': {'count': 16}}}}}} | +| [CmedqaRetrieval](https://aclanthology.org/2022.emnlp-main.357.pdf) (Yifu Qiu, 2022) | ['cmn'] | Retrieval | s2p | [Medical, Written] | None | None | +| [Cmnli](https://huggingface.co/datasets/clue/viewer/cmnli) (Xu et al., 2020) | ['cmn'] | PairClassification | s2s | | None | None | | [CodeEditSearchRetrieval](https://huggingface.co/datasets/cassanof/CodeEditSearch/viewer) (Niklas Muennighoff, 2023) | ['c', 'c++', 'go', 'java', 'javascript', 'php', 'python', 'ruby', 'rust', 'scala', 'shell', 'swift', 'typescript'] | Retrieval | p2p | [Programming, Written] | {'train': 26000} | {'train': {'number_of_characters': 935841, 'num_samples': 26000, 'num_queries': 13000, 'num_documents': 13000, 'min_document_length': 18, 'average_document_length': 70.99, 'max_document_length': 2532, 'unique_documents': 13000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 13000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 13000, 'hf_subset_descriptive_stats': {'python': {'number_of_characters': 70519, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 21, 'average_document_length': 69.52, 'max_document_length': 1811, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 
1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'javascript': {'number_of_characters': 57880, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 18, 'average_document_length': 56.88, 'max_document_length': 601, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'typescript': {'number_of_characters': 61092, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 19, 'average_document_length': 60.09, 'max_document_length': 659, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'go': {'number_of_characters': 71797, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 19, 'average_document_length': 70.8, 'max_document_length': 1529, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'ruby': {'number_of_characters': 67900, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 20, 'average_document_length': 66.9, 'max_document_length': 751, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'java': 
{'number_of_characters': 63984, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 23, 'average_document_length': 62.98, 'max_document_length': 807, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'php': {'number_of_characters': 62927, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 21, 'average_document_length': 61.93, 'max_document_length': 766, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'c': {'number_of_characters': 98588, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 20, 'average_document_length': 97.59, 'max_document_length': 1672, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'c++': {'number_of_characters': 115480, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 22, 'average_document_length': 114.48, 'max_document_length': 1856, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'rust': {'number_of_characters': 68503, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 19, 'average_document_length': 67.5, 
'max_document_length': 2532, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'swift': {'number_of_characters': 58279, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 19, 'average_document_length': 57.28, 'max_document_length': 727, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'scala': {'number_of_characters': 65833, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 22, 'average_document_length': 64.83, 'max_document_length': 685, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'shell': {'number_of_characters': 73059, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 18, 'average_document_length': 72.06, 'max_document_length': 813, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}}}} | | [CodeFeedbackMT](https://arxiv.org/abs/2402.14658) (Tianyu Zheng, 2024) | ['eng'] | Retrieval | p2p | [Programming, Written] | {'test': 79660} | {'test': {'number_of_characters': 156266302, 'num_samples': 79660, 'num_queries': 13277, 'num_documents': 66383, 'min_document_length': 127, 'average_document_length': 885.13, 
'max_document_length': 32432, 'unique_documents': 66383, 'min_query_length': 2, 'average_query_length': 7344.18, 'max_query_length': 9403, 'unique_queries': 13277, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 13277}} | | [CodeFeedbackST](https://arxiv.org/abs/2407.02883) (Xiangyang Li, 2024) | ['eng'] | Retrieval | p2p | [Programming, Written] | {'test': 187832} | {'test': {'number_of_characters': 260957682, 'num_samples': 187832, 'num_queries': 31306, 'num_documents': 156526, 'min_document_length': 26, 'average_document_length': 144.85, 'max_document_length': 13851, 'unique_documents': 156526, 'min_query_length': 1, 'average_query_length': 7611.46, 'max_query_length': 11354, 'unique_queries': 31306, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 31306}} | @@ -206,6 +208,9 @@ The following tables give you an overview of the tasks in MTEB. 
| [CodeSearchNetRetrieval](https://huggingface.co/datasets/code_search_net/) (Husain et al., 2019) | ['go', 'java', 'javascript', 'php', 'python', 'ruby'] | Retrieval | p2p | [Programming, Written] | {'test': 12000} | {'test': {'number_of_characters': 1950074, 'num_samples': 12000, 'num_queries': 6000, 'num_documents': 6000, 'min_document_length': 2, 'average_document_length': 324.01, 'max_document_length': 17533, 'unique_documents': 6000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 6000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 6000, 'hf_subset_descriptive_stats': {'python': {'number_of_characters': 467546, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 8, 'average_document_length': 466.55, 'max_document_length': 8636, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'javascript': {'number_of_characters': 187018, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 2, 'average_document_length': 186.02, 'max_document_length': 7657, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'go': {'number_of_characters': 126213, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 14, 'average_document_length': 125.21, 'max_document_length': 1501, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 
'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'ruby': {'number_of_characters': 314818, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 5, 'average_document_length': 313.82, 'max_document_length': 17533, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'java': {'number_of_characters': 691360, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 2, 'average_document_length': 690.36, 'max_document_length': 6473, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'php': {'number_of_characters': 163119, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 5, 'average_document_length': 162.12, 'max_document_length': 1240, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}}}} | | [CodeTransOceanContest](https://arxiv.org/abs/2310.04951) (Weixiang Yan, 2023) | ['c++', 'python'] | Retrieval | p2p | [Programming, Written] | {'test': 1229} | {'test': {'number_of_characters': 1744286, 'num_samples': 1229, 'num_queries': 221, 'num_documents': 1008, 'min_document_length': 8, 'average_document_length': 221.9, 'max_document_length': 4147, 'unique_documents': 1008, 'min_query_length': 8, 'average_query_length': 6880.58, 'max_query_length': 10852, 'unique_queries': 221, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 221}} | | [CodeTransOceanDL](https://arxiv.org/abs/2310.04951) (Weixiang Yan, 2023) | ['python'] | Retrieval | p2p | [Programming, Written] | {'test': 996} | {'test': {'number_of_characters': 1543912, 'num_samples': 996, 'num_queries': 180, 'num_documents': 816, 'min_document_length': 376, 'average_document_length': 411.98, 'max_document_length': 8285, 'unique_documents': 816, 'min_query_length': 58, 'average_query_length': 6709.67, 'max_query_length': 8469, 'unique_queries': 180, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 180}} | +| [CommonLanguageAgeDetection](https://huggingface.co/datasets/speechbrain/common_language) | ['eng'] | AudioClassification | a2t | [Scene, Speech, Spoken] | None | None | +| [CommonLanguageGenderDetection](https://huggingface.co/datasets/speechbrain/common_language) | ['eng'] | AudioClassification | a2t | [Scene, Speech, Spoken] | None | None | +| [CommonLanguageLanguageDetection](https://huggingface.co/datasets/speechbrain/common_language) | ['eng'] | AudioClassification | a2t | [Scene, Speech, Spoken] | None | None | | [ContractNLIConfidentialityOfAgreementLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [ContractNLIExplicitIdentificationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [ContractNLIInclusionOfVerballyConveyedInformationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | @@ -225,13 +230,13 @@ The following tables give you an overview of the tasks in MTEB. 
| [CosQA](https://arxiv.org/abs/2105.13239) (Junjie Huang, 2021) | ['eng', 'python'] | Retrieval | p2p | [Programming, Written] | {'test': 21104} | {'test': {'number_of_characters': 5728450, 'num_samples': 21104, 'num_queries': 500, 'num_documents': 20604, 'min_document_length': 18, 'average_document_length': 0.89, 'max_document_length': 83, 'unique_documents': 20604, 'min_query_length': 88, 'average_query_length': 11420.09, 'max_query_length': 6396, 'unique_queries': 500, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 500}} | | [Country211](https://huggingface.co/datasets/clip-benchmark/wds_country211) (Radford et al., 2021) | ['eng'] | ImageClassification | i2i | [Scene] | {'test': 21100} | {'test': {'num_samples': 21100, 'unique_num_labels': 211, 'min_image_width': 32, 'average_image_width': 468.59, 'max_image_width': 500, 'min_image_height': 37, 'average_image_height': 381.73, 'max_image_height': 500, 'labels': {'0': {'count': 100}, '1': {'count': 100}, '2': {'count': 100}, '3': {'count': 100}, '4': {'count': 100}, '5': {'count': 100}, '6': {'count': 100}, '7': {'count': 100}, '8': {'count': 100}, '9': {'count': 100}, '10': {'count': 100}, '11': {'count': 100}, '12': {'count': 100}, '13': {'count': 100}, '14': {'count': 100}, '15': {'count': 100}, '16': {'count': 100}, '17': {'count': 100}, '18': {'count': 100}, '19': {'count': 100}, '20': {'count': 100}, '21': {'count': 100}, '22': {'count': 100}, '23': {'count': 100}, '24': {'count': 100}, '25': {'count': 100}, '26': {'count': 100}, '27': {'count': 100}, '28': {'count': 100}, '29': {'count': 100}, '30': {'count': 100}, '31': {'count': 100}, '32': {'count': 100}, '33': {'count': 100}, '34': {'count': 100}, '35': {'count': 100}, '36': {'count': 100}, '37': {'count': 100}, '38': {'count': 100}, '39': {'count': 100}, '40': {'count': 100}, '41': {'count': 100}, '42': {'count': 100}, '43': {'count': 100}, '44': {'count': 100}, 
'45': {'count': 100}, '46': {'count': 100}, '47': {'count': 100}, '48': {'count': 100}, '49': {'count': 100}, '50': {'count': 100}, '51': {'count': 100}, '52': {'count': 100}, '53': {'count': 100}, '54': {'count': 100}, '55': {'count': 100}, '56': {'count': 100}, '57': {'count': 100}, '58': {'count': 100}, '59': {'count': 100}, '60': {'count': 100}, '61': {'count': 100}, '62': {'count': 100}, '63': {'count': 100}, '64': {'count': 100}, '65': {'count': 100}, '66': {'count': 100}, '67': {'count': 100}, '68': {'count': 100}, '69': {'count': 100}, '70': {'count': 100}, '71': {'count': 100}, '72': {'count': 100}, '73': {'count': 100}, '74': {'count': 100}, '75': {'count': 100}, '76': {'count': 100}, '77': {'count': 100}, '78': {'count': 100}, '79': {'count': 100}, '80': {'count': 100}, '81': {'count': 100}, '82': {'count': 100}, '83': {'count': 100}, '84': {'count': 100}, '85': {'count': 100}, '86': {'count': 100}, '87': {'count': 100}, '88': {'count': 100}, '89': {'count': 100}, '90': {'count': 100}, '91': {'count': 100}, '92': {'count': 100}, '93': {'count': 100}, '94': {'count': 100}, '95': {'count': 100}, '96': {'count': 100}, '97': {'count': 100}, '98': {'count': 100}, '99': {'count': 100}, '100': {'count': 100}, '101': {'count': 100}, '102': {'count': 100}, '103': {'count': 100}, '104': {'count': 100}, '105': {'count': 100}, '106': {'count': 100}, '107': {'count': 100}, '108': {'count': 100}, '109': {'count': 100}, '110': {'count': 100}, '111': {'count': 100}, '112': {'count': 100}, '113': {'count': 100}, '114': {'count': 100}, '115': {'count': 100}, '116': {'count': 100}, '117': {'count': 100}, '118': {'count': 100}, '119': {'count': 100}, '120': {'count': 100}, '121': {'count': 100}, '122': {'count': 100}, '123': {'count': 100}, '124': {'count': 100}, '125': {'count': 100}, '126': {'count': 100}, '127': {'count': 100}, '128': {'count': 100}, '129': {'count': 100}, '130': {'count': 100}, '131': {'count': 100}, '132': {'count': 100}, '133': {'count': 100}, '134': 
{'count': 100}, '135': {'count': 100}, '136': {'count': 100}, '137': {'count': 100}, '138': {'count': 100}, '139': {'count': 100}, '140': {'count': 100}, '141': {'count': 100}, '142': {'count': 100}, '143': {'count': 100}, '144': {'count': 100}, '145': {'count': 100}, '146': {'count': 100}, '147': {'count': 100}, '148': {'count': 100}, '149': {'count': 100}, '150': {'count': 100}, '151': {'count': 100}, '152': {'count': 100}, '153': {'count': 100}, '154': {'count': 100}, '155': {'count': 100}, '156': {'count': 100}, '157': {'count': 100}, '158': {'count': 100}, '159': {'count': 100}, '160': {'count': 100}, '161': {'count': 100}, '162': {'count': 100}, '163': {'count': 100}, '164': {'count': 100}, '165': {'count': 100}, '166': {'count': 100}, '167': {'count': 100}, '168': {'count': 100}, '169': {'count': 100}, '170': {'count': 100}, '171': {'count': 100}, '172': {'count': 100}, '173': {'count': 100}, '174': {'count': 100}, '175': {'count': 100}, '176': {'count': 100}, '177': {'count': 100}, '178': {'count': 100}, '179': {'count': 100}, '180': {'count': 100}, '181': {'count': 100}, '182': {'count': 100}, '183': {'count': 100}, '184': {'count': 100}, '185': {'count': 100}, '186': {'count': 100}, '187': {'count': 100}, '188': {'count': 100}, '189': {'count': 100}, '190': {'count': 100}, '191': {'count': 100}, '192': {'count': 100}, '193': {'count': 100}, '194': {'count': 100}, '195': {'count': 100}, '196': {'count': 100}, '197': {'count': 100}, '198': {'count': 100}, '199': {'count': 100}, '200': {'count': 100}, '201': {'count': 100}, '202': {'count': 100}, '203': {'count': 100}, '204': {'count': 100}, '205': {'count': 100}, '206': {'count': 100}, '207': {'count': 100}, '208': {'count': 100}, '209': {'count': 100}, '210': {'count': 100}}}} | | [Country211ZeroShot](https://huggingface.co/datasets/clip-benchmark/wds_country211) (Radford et al., 2021) | ['eng'] | ZeroShotClassification | i2t | [Scene] | {'test': 21100} | {'test': {'num_samples': 21100, 
'unique_num_labels': 211, 'min_image_width': 32, 'average_image_width': 468.59, 'max_image_width': 500, 'min_image_height': 37, 'average_image_height': 381.73, 'max_image_height': 500, 'min_label_text_length': 37, 'average_label_text_length': 41.95, 'max_label_text_length': 69, 'labels': {'0': {'count': 100}, '1': {'count': 100}, '2': {'count': 100}, '3': {'count': 100}, '4': {'count': 100}, '5': {'count': 100}, '6': {'count': 100}, '7': {'count': 100}, '8': {'count': 100}, '9': {'count': 100}, '10': {'count': 100}, '11': {'count': 100}, '12': {'count': 100}, '13': {'count': 100}, '14': {'count': 100}, '15': {'count': 100}, '16': {'count': 100}, '17': {'count': 100}, '18': {'count': 100}, '19': {'count': 100}, '20': {'count': 100}, '21': {'count': 100}, '22': {'count': 100}, '23': {'count': 100}, '24': {'count': 100}, '25': {'count': 100}, '26': {'count': 100}, '27': {'count': 100}, '28': {'count': 100}, '29': {'count': 100}, '30': {'count': 100}, '31': {'count': 100}, '32': {'count': 100}, '33': {'count': 100}, '34': {'count': 100}, '35': {'count': 100}, '36': {'count': 100}, '37': {'count': 100}, '38': {'count': 100}, '39': {'count': 100}, '40': {'count': 100}, '41': {'count': 100}, '42': {'count': 100}, '43': {'count': 100}, '44': {'count': 100}, '45': {'count': 100}, '46': {'count': 100}, '47': {'count': 100}, '48': {'count': 100}, '49': {'count': 100}, '50': {'count': 100}, '51': {'count': 100}, '52': {'count': 100}, '53': {'count': 100}, '54': {'count': 100}, '55': {'count': 100}, '56': {'count': 100}, '57': {'count': 100}, '58': {'count': 100}, '59': {'count': 100}, '60': {'count': 100}, '61': {'count': 100}, '62': {'count': 100}, '63': {'count': 100}, '64': {'count': 100}, '65': {'count': 100}, '66': {'count': 100}, '67': {'count': 100}, '68': {'count': 100}, '69': {'count': 100}, '70': {'count': 100}, '71': {'count': 100}, '72': {'count': 100}, '73': {'count': 100}, '74': {'count': 100}, '75': {'count': 100}, '76': {'count': 100}, '77': {'count': 100}, 
'78': {'count': 100}, '79': {'count': 100}, '80': {'count': 100}, '81': {'count': 100}, '82': {'count': 100}, '83': {'count': 100}, '84': {'count': 100}, '85': {'count': 100}, '86': {'count': 100}, '87': {'count': 100}, '88': {'count': 100}, '89': {'count': 100}, '90': {'count': 100}, '91': {'count': 100}, '92': {'count': 100}, '93': {'count': 100}, '94': {'count': 100}, '95': {'count': 100}, '96': {'count': 100}, '97': {'count': 100}, '98': {'count': 100}, '99': {'count': 100}, '100': {'count': 100}, '101': {'count': 100}, '102': {'count': 100}, '103': {'count': 100}, '104': {'count': 100}, '105': {'count': 100}, '106': {'count': 100}, '107': {'count': 100}, '108': {'count': 100}, '109': {'count': 100}, '110': {'count': 100}, '111': {'count': 100}, '112': {'count': 100}, '113': {'count': 100}, '114': {'count': 100}, '115': {'count': 100}, '116': {'count': 100}, '117': {'count': 100}, '118': {'count': 100}, '119': {'count': 100}, '120': {'count': 100}, '121': {'count': 100}, '122': {'count': 100}, '123': {'count': 100}, '124': {'count': 100}, '125': {'count': 100}, '126': {'count': 100}, '127': {'count': 100}, '128': {'count': 100}, '129': {'count': 100}, '130': {'count': 100}, '131': {'count': 100}, '132': {'count': 100}, '133': {'count': 100}, '134': {'count': 100}, '135': {'count': 100}, '136': {'count': 100}, '137': {'count': 100}, '138': {'count': 100}, '139': {'count': 100}, '140': {'count': 100}, '141': {'count': 100}, '142': {'count': 100}, '143': {'count': 100}, '144': {'count': 100}, '145': {'count': 100}, '146': {'count': 100}, '147': {'count': 100}, '148': {'count': 100}, '149': {'count': 100}, '150': {'count': 100}, '151': {'count': 100}, '152': {'count': 100}, '153': {'count': 100}, '154': {'count': 100}, '155': {'count': 100}, '156': {'count': 100}, '157': {'count': 100}, '158': {'count': 100}, '159': {'count': 100}, '160': {'count': 100}, '161': {'count': 100}, '162': {'count': 100}, '163': {'count': 100}, '164': {'count': 100}, '165': {'count': 
100}, '166': {'count': 100}, '167': {'count': 100}, '168': {'count': 100}, '169': {'count': 100}, '170': {'count': 100}, '171': {'count': 100}, '172': {'count': 100}, '173': {'count': 100}, '174': {'count': 100}, '175': {'count': 100}, '176': {'count': 100}, '177': {'count': 100}, '178': {'count': 100}, '179': {'count': 100}, '180': {'count': 100}, '181': {'count': 100}, '182': {'count': 100}, '183': {'count': 100}, '184': {'count': 100}, '185': {'count': 100}, '186': {'count': 100}, '187': {'count': 100}, '188': {'count': 100}, '189': {'count': 100}, '190': {'count': 100}, '191': {'count': 100}, '192': {'count': 100}, '193': {'count': 100}, '194': {'count': 100}, '195': {'count': 100}, '196': {'count': 100}, '197': {'count': 100}, '198': {'count': 100}, '199': {'count': 100}, '200': {'count': 100}, '201': {'count': 100}, '202': {'count': 100}, '203': {'count': 100}, '204': {'count': 100}, '205': {'count': 100}, '206': {'count': 100}, '207': {'count': 100}, '208': {'count': 100}, '209': {'count': 100}, '210': {'count': 100}}}} | -| [CovidRetrieval](https://arxiv.org/abs/2203.03367) | ['cmn'] | Retrieval | s2p | | None | None | +| [CovidRetrieval](https://arxiv.org/abs/2203.03367) (Dingkun Long, 2022) | ['cmn'] | Retrieval | s2p | [Entertainment, Medical] | None | None | | [CrossLingualSemanticDiscriminationWMT19](https://huggingface.co/datasets/Andrianos/clsd_wmt19_21) | ['deu', 'fra'] | Retrieval | s2s | [News, Written] | None | None | | [CrossLingualSemanticDiscriminationWMT21](https://huggingface.co/datasets/Andrianos/clsd_wmt19_21) | ['deu', 'fra'] | Retrieval | s2s | [News, Written] | None | None | | [CyrillicTurkicLangClassification](https://huggingface.co/datasets/tatiana-merz/cyrillic_turkic_langs) (Goldhahn et al., 2012) | ['bak', 'chv', 'kaz', 'kir', 'krc', 'rus', 'sah', 'tat', 'tyv'] | Classification | s2s | [Web, Written] | None | None | -| [CzechProductReviewSentimentClassification](https://aclanthology.org/W13-1609/) | ['ces'] | Classification | s2s | 
[Reviews, Written] | None | None | -| [CzechSoMeSentimentClassification](https://aclanthology.org/W13-1609/) | ['ces'] | Classification | s2s | [Reviews, Written] | None | None | -| [CzechSubjectivityClassification](https://arxiv.org/abs/2009.08712) | ['ces'] | Classification | s2s | [Reviews, Written] | None | None | +| [CzechProductReviewSentimentClassification](https://aclanthology.org/W13-1609/) (Habernal et al., 2013) | ['ces'] | Classification | s2s | [Reviews, Written] | None | None | +| [CzechSoMeSentimentClassification](https://aclanthology.org/W13-1609/) (Habernal et al., 2013) | ['ces'] | Classification | s2s | [Reviews, Written] | None | None | +| [CzechSubjectivityClassification](https://arxiv.org/abs/2009.08712) (Přibáň et al., 2022) | ['ces'] | Classification | s2s | [Reviews, Written] | None | None | | [DBPedia](https://github.com/iai-group/DBpedia-Entity/) (Hasibi et al., 2017) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [DBPedia-Fa](https://huggingface.co/datasets/MCINext/dbpedia-fa) | ['fas'] | Retrieval | s2p | [Encyclopaedic] | None | None | | [DBPedia-NL](https://huggingface.co/datasets/clips/beir-nl-dbpedia-entity) (Nikolay Banar, 2024) | ['nld'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | @@ -239,11 +244,11 @@ The following tables give you an overview of the tasks in MTEB. 
| [DBPedia-PLHardNegatives](https://github.com/iai-group/DBpedia-Entity/) (Hasibi et al., 2017) | ['pol'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [DBPediaHardNegatives](https://github.com/iai-group/DBpedia-Entity/) (Hasibi et al., 2017) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [DBpediaClassification](https://arxiv.org/abs/1509.01626) (Zhang et al., 2015) | ['eng'] | Classification | s2s | [Encyclopaedic, Written] | None | None | -| [DKHateClassification](https://aclanthology.org/2020.lrec-1.430/) | ['dan'] | Classification | s2s | [Social, Written] | None | None | -| [DTD](https://www.robots.ox.ac.uk/~vgg/data/dtd/) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | {'test': 1880} | {'test': {'num_samples': 1880, 'unique_num_labels': 47, 'min_image_width': 300, 'average_image_width': 488.98, 'max_image_width': 900, 'min_image_height': 300, 'average_image_height': 447.5, 'max_image_height': 778, 'labels': {'0': {'count': 40}, '1': {'count': 40}, '10': {'count': 40}, '11': {'count': 40}, '12': {'count': 40}, '13': {'count': 40}, '14': {'count': 40}, '15': {'count': 40}, '16': {'count': 40}, '17': {'count': 40}, '18': {'count': 40}, '19': {'count': 40}, '2': {'count': 40}, '20': {'count': 40}, '21': {'count': 40}, '22': {'count': 40}, '23': {'count': 40}, '24': {'count': 40}, '25': {'count': 40}, '26': {'count': 40}, '27': {'count': 40}, '28': {'count': 40}, '29': {'count': 40}, '3': {'count': 40}, '30': {'count': 40}, '31': {'count': 40}, '32': {'count': 40}, '33': {'count': 40}, '34': {'count': 40}, '35': {'count': 40}, '36': {'count': 40}, '37': {'count': 40}, '38': {'count': 40}, '39': {'count': 40}, '4': {'count': 40}, '40': {'count': 40}, '41': {'count': 40}, '42': {'count': 40}, '43': {'count': 40}, '44': {'count': 40}, '45': {'count': 40}, '46': {'count': 40}, '5': {'count': 40}, '6': {'count': 40}, '7': {'count': 40}, '8': {'count': 40}, '9': {'count': 40}}}} | -| 
[DTDZeroShot](https://www.robots.ox.ac.uk/~vgg/data/dtd/) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | {'test': 1880} | {'test': {'num_samples': 1880, 'unique_num_labels': 47, 'min_image_width': 300, 'average_image_width': 488.98, 'max_image_width': 900, 'min_image_height': 300, 'average_image_height': 447.5, 'max_image_height': 778, 'min_label_text_length': 24, 'average_label_text_length': 27.38, 'max_label_text_length': 32, 'labels': {'0': {'count': 40}, '1': {'count': 40}, '10': {'count': 40}, '11': {'count': 40}, '12': {'count': 40}, '13': {'count': 40}, '14': {'count': 40}, '15': {'count': 40}, '16': {'count': 40}, '17': {'count': 40}, '18': {'count': 40}, '19': {'count': 40}, '2': {'count': 40}, '20': {'count': 40}, '21': {'count': 40}, '22': {'count': 40}, '23': {'count': 40}, '24': {'count': 40}, '25': {'count': 40}, '26': {'count': 40}, '27': {'count': 40}, '28': {'count': 40}, '29': {'count': 40}, '3': {'count': 40}, '30': {'count': 40}, '31': {'count': 40}, '32': {'count': 40}, '33': {'count': 40}, '34': {'count': 40}, '35': {'count': 40}, '36': {'count': 40}, '37': {'count': 40}, '38': {'count': 40}, '39': {'count': 40}, '4': {'count': 40}, '40': {'count': 40}, '41': {'count': 40}, '42': {'count': 40}, '43': {'count': 40}, '44': {'count': 40}, '45': {'count': 40}, '46': {'count': 40}, '5': {'count': 40}, '6': {'count': 40}, '7': {'count': 40}, '8': {'count': 40}, '9': {'count': 40}}}} | -| [DalajClassification](https://spraakbanken.gu.se/en/resources/superlim) | ['swe'] | Classification | s2s | [Non-fiction, Written] | None | None | -| [DanFeverRetrieval](https://aclanthology.org/2021.nodalida-main.47/) | ['dan'] | Retrieval | p2p | [Encyclopaedic, Non-fiction, Spoken] | None | None | +| [DKHateClassification](https://aclanthology.org/2020.lrec-1.430/) (Sigurbergsson et al., 2020) | ['dan'] | Classification | s2s | [Social, Written] | None | None | +| [DTD](https://www.robots.ox.ac.uk/~vgg/data/dtd/) (M. 
Cimpoi, 2014) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | {'test': 1880} | {'test': {'num_samples': 1880, 'unique_num_labels': 47, 'min_image_width': 300, 'average_image_width': 488.98, 'max_image_width': 900, 'min_image_height': 300, 'average_image_height': 447.5, 'max_image_height': 778, 'labels': {'0': {'count': 40}, '1': {'count': 40}, '10': {'count': 40}, '11': {'count': 40}, '12': {'count': 40}, '13': {'count': 40}, '14': {'count': 40}, '15': {'count': 40}, '16': {'count': 40}, '17': {'count': 40}, '18': {'count': 40}, '19': {'count': 40}, '2': {'count': 40}, '20': {'count': 40}, '21': {'count': 40}, '22': {'count': 40}, '23': {'count': 40}, '24': {'count': 40}, '25': {'count': 40}, '26': {'count': 40}, '27': {'count': 40}, '28': {'count': 40}, '29': {'count': 40}, '3': {'count': 40}, '30': {'count': 40}, '31': {'count': 40}, '32': {'count': 40}, '33': {'count': 40}, '34': {'count': 40}, '35': {'count': 40}, '36': {'count': 40}, '37': {'count': 40}, '38': {'count': 40}, '39': {'count': 40}, '4': {'count': 40}, '40': {'count': 40}, '41': {'count': 40}, '42': {'count': 40}, '43': {'count': 40}, '44': {'count': 40}, '45': {'count': 40}, '46': {'count': 40}, '5': {'count': 40}, '6': {'count': 40}, '7': {'count': 40}, '8': {'count': 40}, '9': {'count': 40}}}} | +| [DTDZeroShot](https://www.robots.ox.ac.uk/~vgg/data/dtd/) (M. 
Cimpoi, 2014) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | {'test': 1880} | {'test': {'num_samples': 1880, 'unique_num_labels': 47, 'min_image_width': 300, 'average_image_width': 488.98, 'max_image_width': 900, 'min_image_height': 300, 'average_image_height': 447.5, 'max_image_height': 778, 'min_label_text_length': 24, 'average_label_text_length': 27.38, 'max_label_text_length': 32, 'labels': {'0': {'count': 40}, '1': {'count': 40}, '10': {'count': 40}, '11': {'count': 40}, '12': {'count': 40}, '13': {'count': 40}, '14': {'count': 40}, '15': {'count': 40}, '16': {'count': 40}, '17': {'count': 40}, '18': {'count': 40}, '19': {'count': 40}, '2': {'count': 40}, '20': {'count': 40}, '21': {'count': 40}, '22': {'count': 40}, '23': {'count': 40}, '24': {'count': 40}, '25': {'count': 40}, '26': {'count': 40}, '27': {'count': 40}, '28': {'count': 40}, '29': {'count': 40}, '3': {'count': 40}, '30': {'count': 40}, '31': {'count': 40}, '32': {'count': 40}, '33': {'count': 40}, '34': {'count': 40}, '35': {'count': 40}, '36': {'count': 40}, '37': {'count': 40}, '38': {'count': 40}, '39': {'count': 40}, '4': {'count': 40}, '40': {'count': 40}, '41': {'count': 40}, '42': {'count': 40}, '43': {'count': 40}, '44': {'count': 40}, '45': {'count': 40}, '46': {'count': 40}, '5': {'count': 40}, '6': {'count': 40}, '7': {'count': 40}, '8': {'count': 40}, '9': {'count': 40}}}} | +| [DalajClassification](https://spraakbanken.gu.se/en/resources/superlim) (Elena Volodina, 2021) | ['swe'] | Classification | s2s | [Non-fiction, Written] | None | None | +| [DanFeverRetrieval](https://aclanthology.org/2021.nodalida-main.47/) (Nørregaard et al., 2021) | ['dan'] | Retrieval | p2p | [Encyclopaedic, Non-fiction, Spoken] | None | None | | [DanishPoliticalCommentsClassification](https://huggingface.co/datasets/danish_political_comments) (Mads Guldborg Kjeldgaard Kongsbak, 2019) | ['dan'] | Classification | s2s | [Social, Written] | None | None | | 
[DeepSentiPers](https://github.com/JoyeBright/DeepSentiPers) | ['fas'] | Classification | s2s | [Reviews] | None | None | | [DefinitionClassificationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | @@ -259,42 +264,47 @@ The following tables give you an overview of the tasks in MTEB. | [DuRetrieval](https://aclanthology.org/2022.emnlp-main.357.pdf) (Yifu Qiu, 2022) | ['cmn'] | Retrieval | s2p | | None | None | | [DutchBookReviewSentimentClassification](https://github.com/benjaminvdb/DBRD) (Benjamin et al., 2019) | ['nld'] | Classification | s2s | [Reviews, Written] | None | None | | [EDIST2ITRetrieval](https://aclanthology.org/2023.emnlp-main.297/) (Liu et al., 2023) | ['eng'] | Any2AnyRetrieval | t2it | [News] | {'test': 1050308} | {'test': {'number_of_characters': 97112347, 'num_samples': 1050308, 'num_queries': 3241, 'num_documents': 1047067, 'min_document_length': 3, 'average_document_length': 92.37, 'max_document_length': 264, 'unique_documents': 553171, 'num_document_images': 1047067, 'min_query_length': 30, 'average_query_length': 120.33, 'max_query_length': 340, 'unique_queries': 3241, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 2.57, 'max_relevant_docs_per_query': 8, 'unique_relevant_docs': 8324}} | +| [ESC50](https://huggingface.co/datasets/ashraq/esc50) | ['eng'] | AudioClassification | a2t | [Spoken] | None | None | +| [ESC50_Zeroshot](https://huggingface.co/datasets/ashraq/esc50) | ['eng'] | AudioZeroshotClassification | a2a | [Spoken] | None | None | | [ESCIReranking](https://github.com/amazon-science/esci-data/) (Chandan K. 
Reddy, 2022) | ['eng', 'jpn', 'spa'] | Reranking | s2p | [Written] | {'test': 29285} | {'test': {'num_samples': 29285, 'number_of_characters': 254538331, 'num_positive': 271416, 'num_negative': 44235, 'min_query_length': 1, 'avg_query_length': 19.69, 'max_query_length': 151, 'unique_query': 29269, 'min_positive_length': 1, 'avg_positive_length': 803.92, 'max_positive_length': 8640, 'unique_positive': 217712, 'min_negative_length': 1, 'avg_negative_length': 808.5, 'max_negative_length': 4441, 'unique_negative': 39551, 'hf_subset_descriptive_stats': {'us': {'num_samples': 21296, 'number_of_characters': 186915609, 'num_positive': 189375, 'num_negative': 25463, 'min_query_length': 1, 'avg_query_length': 21.44, 'max_query_length': 151, 'unique_query': 21296, 'min_positive_length': 1, 'avg_positive_length': 868.37, 'max_positive_length': 5545, 'unique_positive': 150734, 'min_negative_length': 1, 'avg_negative_length': 864.45, 'max_negative_length': 3779, 'unique_negative': 23073}, 'es': {'num_samples': 3703, 'number_of_characters': 48861389, 'num_positive': 39110, 'num_negative': 10183, 'min_query_length': 3, 'avg_query_length': 20.68, 'max_query_length': 59, 'unique_query': 3703, 'min_positive_length': 1, 'avg_positive_length': 980.96, 'max_positive_length': 8640, 'unique_positive': 32921, 'min_negative_length': 1, 'avg_negative_length': 1023.22, 'max_negative_length': 4441, 'unique_negative': 9285}, 'jp': {'num_samples': 4286, 'number_of_characters': 18761333, 'num_positive': 42931, 'num_negative': 8589, 'min_query_length': 1, 'avg_query_length': 10.15, 'max_query_length': 60, 'unique_query': 4286, 'min_positive_length': 1, 'avg_positive_length': 358.36, 'max_positive_length': 3488, 'unique_positive': 35165, 'min_negative_length': 1, 'avg_negative_length': 388.08, 'max_negative_length': 3940, 'unique_negative': 7289}}}} | -| [EcomRetrieval](https://arxiv.org/abs/2203.03367) | ['cmn'] | Retrieval | s2p | | None | None | -| 
[EightTagsClustering.v2](https://aclanthology.org/2020.lrec-1.207.pdf) | ['pol'] | Clustering | s2s | [Social, Written] | None | None | -| [EmotionClassification](https://www.aclweb.org/anthology/D18-1404) | ['eng'] | Classification | s2s | [Social, Written] | None | None | +| [EcomRetrieval](https://arxiv.org/abs/2203.03367) (Dingkun Long, 2022) | ['cmn'] | Retrieval | s2p | | None | None | +| [EightTagsClustering.v2](https://aclanthology.org/2020.lrec-1.207.pdf) (Dadas et al., 2020) | ['pol'] | Clustering | s2s | [Social, Written] | None | None | +| [EmotionClassification](https://www.aclweb.org/anthology/D18-1404) (Saravia et al., 2018) | ['eng'] | Classification | s2s | [Social, Written] | None | None | | [EncyclopediaVQAIT2ITRetrieval](https://github.com/google-research/google-research/tree/master/encyclopedic_vqa) (Mensink et al., 2023) | ['eng'] | Any2AnyRetrieval | it2it | [Encyclopaedic] | {'test': 72056} | {'test': {'number_of_characters': 88615743, 'num_samples': 72056, 'num_queries': 3743, 'num_documents': 68313, 'min_document_length': 24, 'average_document_length': 1294.37, 'max_document_length': 72928, 'unique_documents': 49186, 'num_document_images': 68313, 'min_query_length': 19, 'average_query_length': 51.7, 'max_query_length': 245, 'unique_queries': 2832, 'num_query_images': 3743, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.31, 'max_relevant_docs_per_query': 75, 'unique_relevant_docs': 2683}} | -| [EstQA](https://www.semanticscholar.org/paper/Extractive-Question-Answering-for-Estonian-Language-182912IAPM-Alum%C3%A4e/ea4f60ab36cadca059c880678bc4c51e293a85d6?utm_source=direct_link) | ['est'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | -| [EstonianValenceClassification](https://figshare.com/articles/dataset/Estonian_Valence_Corpus_Eesti_valentsikorpus/24517054) | ['est'] | Classification | s2s | [News, Written] | None | None | +| 
[EstQA](https://www.semanticscholar.org/paper/Extractive-Question-Answering-for-Estonian-Language-182912IAPM-Alum%C3%A4e/ea4f60ab36cadca059c880678bc4c51e293a85d6?utm_source=direct_link) (Anu Käver, 2021) | ['est'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | +| [EstonianValenceClassification](https://figshare.com/articles/dataset/Estonian_Valence_Corpus_Eesti_valentsikorpus/24517054) (Hille Pajupuu, 2023) | ['est'] | Classification | s2s | [News, Written] | None | None | | [EuroSAT](https://ieeexplore.ieee.org/document/8736785) (Helber et al., 2019) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | {'test': 5400} | {'test': {'num_samples': 5400, 'unique_num_labels': 10, 'min_image_width': 64, 'average_image_width': 64.0, 'max_image_width': 64, 'min_image_height': 64, 'average_image_height': 64.0, 'max_image_height': 64, 'labels': {'4': {'count': 501}, '3': {'count': 496}, '7': {'count': 554}, '2': {'count': 573}, '9': {'count': 609}, '0': {'count': 596}, '8': {'count': 529}, '1': {'count': 608}, '5': {'count': 396}, '6': {'count': 538}}}} | | [EuroSATZeroShot](https://ieeexplore.ieee.org/document/8736785) (Helber et al., 2019) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | {'test': 5400} | {'test': {'num_samples': 5400, 'unique_num_labels': 10, 'min_image_width': 64, 'average_image_width': 64.0, 'max_image_width': 64, 'min_image_height': 64, 'average_image_height': 64.0, 'max_image_height': 64, 'min_label_text_length': 36, 'average_label_text_length': 45.2, 'max_label_text_length': 53, 'labels': {'4': {'count': 501}, '3': {'count': 496}, '7': {'count': 554}, '2': {'count': 573}, '9': {'count': 609}, '0': {'count': 596}, '8': {'count': 529}, '1': {'count': 608}, '5': {'count': 396}, '6': {'count': 538}}}} | | [FER2013](https://arxiv.org/abs/1412.6572) (Ian J. 
Goodfellow, 2015) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | {'test': 7178} | {'test': {'num_samples': 7178, 'unique_num_labels': 7, 'min_image_width': 48, 'average_image_width': 48.0, 'max_image_width': 48, 'min_image_height': 48, 'average_image_height': 48.0, 'max_image_height': 48, 'labels': {'0': {'count': 958}, '1': {'count': 111}, '2': {'count': 1024}, '3': {'count': 1774}, '4': {'count': 1233}, '5': {'count': 1247}, '6': {'count': 831}}}} | | [FER2013ZeroShot](https://arxiv.org/abs/1412.6572) (Ian J. Goodfellow, 2015) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | {'test': 7178} | {'test': {'num_samples': 7178, 'unique_num_labels': 7, 'min_image_width': 48, 'average_image_width': 48.0, 'max_image_width': 48, 'min_image_height': 48, 'average_image_height': 48.0, 'max_image_height': 48, 'min_label_text_length': 30, 'average_label_text_length': 32.57, 'max_label_text_length': 35, 'labels': {'0': {'count': 958}, '1': {'count': 111}, '2': {'count': 1024}, '3': {'count': 1774}, '4': {'count': 1233}, '5': {'count': 1247}, '6': {'count': 831}}}} | -| [FEVER](https://fever.ai/) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | +| [FEVER](https://fever.ai/) (Thorne et al., 2018) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [FEVER-NL](https://huggingface.co/datasets/clips/beir-nl-fever) (Nikolay Banar, 2024) | ['nld'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | -| [FEVERHardNegatives](https://fever.ai/) | ['eng'] | Retrieval | s2p | | None | None | +| [FEVERHardNegatives](https://fever.ai/) (Thorne et al., 2018) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [FGVCAircraft](https://arxiv.org/abs/1306.5151) (Subhransu Maji, 2013) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | {'test': 3333} | {'test': {'num_samples': 3333, 'unique_num_labels': 100, 'min_image_width': 800, 'average_image_width': 1098.58, 'max_image_width': 1600, 
'min_image_height': 413, 'average_image_height': 747.0, 'max_image_height': 1197, 'labels': {'0': {'count': 33}, '1': {'count': 33}, '2': {'count': 34}, '3': {'count': 33}, '4': {'count': 33}, '5': {'count': 34}, '6': {'count': 33}, '7': {'count': 33}, '8': {'count': 34}, '9': {'count': 33}, '10': {'count': 33}, '11': {'count': 34}, '12': {'count': 33}, '13': {'count': 33}, '14': {'count': 34}, '15': {'count': 33}, '16': {'count': 33}, '17': {'count': 34}, '18': {'count': 33}, '19': {'count': 33}, '20': {'count': 34}, '21': {'count': 33}, '22': {'count': 33}, '23': {'count': 34}, '24': {'count': 33}, '25': {'count': 33}, '26': {'count': 34}, '27': {'count': 33}, '28': {'count': 33}, '29': {'count': 34}, '30': {'count': 33}, '31': {'count': 33}, '32': {'count': 34}, '33': {'count': 33}, '34': {'count': 33}, '35': {'count': 34}, '36': {'count': 33}, '37': {'count': 33}, '38': {'count': 34}, '39': {'count': 33}, '40': {'count': 33}, '41': {'count': 34}, '42': {'count': 33}, '43': {'count': 33}, '44': {'count': 34}, '45': {'count': 33}, '46': {'count': 33}, '47': {'count': 34}, '48': {'count': 33}, '49': {'count': 33}, '50': {'count': 34}, '51': {'count': 33}, '52': {'count': 33}, '53': {'count': 34}, '54': {'count': 33}, '55': {'count': 33}, '56': {'count': 34}, '57': {'count': 33}, '58': {'count': 33}, '59': {'count': 34}, '60': {'count': 33}, '61': {'count': 33}, '62': {'count': 34}, '63': {'count': 33}, '64': {'count': 33}, '65': {'count': 34}, '66': {'count': 33}, '67': {'count': 33}, '68': {'count': 34}, '69': {'count': 33}, '70': {'count': 33}, '71': {'count': 34}, '72': {'count': 33}, '73': {'count': 33}, '74': {'count': 34}, '75': {'count': 33}, '76': {'count': 33}, '77': {'count': 34}, '78': {'count': 33}, '79': {'count': 33}, '80': {'count': 34}, '81': {'count': 33}, '82': {'count': 33}, '83': {'count': 34}, '84': {'count': 33}, '85': {'count': 33}, '86': {'count': 34}, '87': {'count': 33}, '88': {'count': 33}, '89': {'count': 34}, '90': {'count': 33}, '91': 
{'count': 33}, '92': {'count': 34}, '93': {'count': 33}, '94': {'count': 33}, '95': {'count': 34}, '96': {'count': 33}, '97': {'count': 33}, '98': {'count': 34}, '99': {'count': 33}}}} | | [FGVCAircraftZeroShot](https://arxiv.org/abs/1306.5151) (Subhransu Maji, 2013) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | {'test': 3333} | {'test': {'num_samples': 3333, 'unique_num_labels': 100, 'min_image_width': 800, 'average_image_width': 1098.58, 'max_image_width': 1600, 'min_image_height': 413, 'average_image_height': 747.0, 'max_image_height': 1197, 'min_label_text_length': 38, 'average_label_text_length': 41.46, 'max_label_text_length': 53, 'labels': {'0': {'count': 33}, '1': {'count': 33}, '2': {'count': 34}, '3': {'count': 33}, '4': {'count': 33}, '5': {'count': 34}, '6': {'count': 33}, '7': {'count': 33}, '8': {'count': 34}, '9': {'count': 33}, '10': {'count': 33}, '11': {'count': 34}, '12': {'count': 33}, '13': {'count': 33}, '14': {'count': 34}, '15': {'count': 33}, '16': {'count': 33}, '17': {'count': 34}, '18': {'count': 33}, '19': {'count': 33}, '20': {'count': 34}, '21': {'count': 33}, '22': {'count': 33}, '23': {'count': 34}, '24': {'count': 33}, '25': {'count': 33}, '26': {'count': 34}, '27': {'count': 33}, '28': {'count': 33}, '29': {'count': 34}, '30': {'count': 33}, '31': {'count': 33}, '32': {'count': 34}, '33': {'count': 33}, '34': {'count': 33}, '35': {'count': 34}, '36': {'count': 33}, '37': {'count': 33}, '38': {'count': 34}, '39': {'count': 33}, '40': {'count': 33}, '41': {'count': 34}, '42': {'count': 33}, '43': {'count': 33}, '44': {'count': 34}, '45': {'count': 33}, '46': {'count': 33}, '47': {'count': 34}, '48': {'count': 33}, '49': {'count': 33}, '50': {'count': 34}, '51': {'count': 33}, '52': {'count': 33}, '53': {'count': 34}, '54': {'count': 33}, '55': {'count': 33}, '56': {'count': 34}, '57': {'count': 33}, '58': {'count': 33}, '59': {'count': 34}, '60': {'count': 33}, '61': {'count': 33}, '62': {'count': 34}, '63': {'count': 
33}, '64': {'count': 33}, '65': {'count': 34}, '66': {'count': 33}, '67': {'count': 33}, '68': {'count': 34}, '69': {'count': 33}, '70': {'count': 33}, '71': {'count': 34}, '72': {'count': 33}, '73': {'count': 33}, '74': {'count': 34}, '75': {'count': 33}, '76': {'count': 33}, '77': {'count': 34}, '78': {'count': 33}, '79': {'count': 33}, '80': {'count': 34}, '81': {'count': 33}, '82': {'count': 33}, '83': {'count': 34}, '84': {'count': 33}, '85': {'count': 33}, '86': {'count': 34}, '87': {'count': 33}, '88': {'count': 33}, '89': {'count': 34}, '90': {'count': 33}, '91': {'count': 33}, '92': {'count': 34}, '93': {'count': 33}, '94': {'count': 33}, '95': {'count': 34}, '96': {'count': 33}, '97': {'count': 33}, '98': {'count': 34}, '99': {'count': 33}}}} | | [FORBI2IRetrieval](https://github.com/pxiangwu/FORB) (Pengxiang Wu, 2023) | ['eng'] | Any2AnyRetrieval | i2i | [Encyclopaedic] | {'test': 67234} | {'test': {'number_of_characters': 0, 'num_samples': 67234, 'num_queries': 13250, 'num_documents': 53984, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 53984, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 13250, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 4235}} | -| [FQuADRetrieval](https://huggingface.co/datasets/manu/fquad2_test) | ['fra'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | +| [FQuADRetrieval](https://huggingface.co/datasets/manu/fquad2_test) (d'Hoffschmidt et al., 2020) | ['fra'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | +| [FSD2019Kaggle](https://huggingface.co/datasets/confit/fsdkaggle2019-parquet) | ['eng'] | AudioMultilabelClassification | a2t | [Web] | None | None | +| [FSD50K](https://huggingface.co/datasets/Chand0320/fsd50k_hf) (Fonseca et al., 2022) | ['eng'] | AudioMultilabelClassification | a2t 
| [Web] | None | None | +| [FSDD](https://huggingface.co/datasets/silky1708/Free-Spoken-Digit-Dataset) (J. Zohar, 2018) | ['eng'] | AudioClassification | a2t | [Music] | None | None | | [FaithDial](https://mcgill-nlp.github.io/FaithDial) (Dziri et al., 2022) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | -| [FalseFriendsGermanEnglish](https://drive.google.com/file/d/1jgq0nBnV-UiYNxbKNrrr2gxDEHm-DMKH/view?usp=share_link) | ['deu'] | PairClassification | s2s | [Written] | None | None | -| [FaroeseSTS](https://aclanthology.org/2023.nodalida-1.74.pdf) | ['fao'] | STS | s2s | [News, Web, Written] | None | None | +| [FalseFriendsGermanEnglish](https://drive.google.com/file/d/1jgq0nBnV-UiYNxbKNrrr2gxDEHm-DMKH/view?usp=share_link) (Chibb et al., 2022) | ['deu'] | PairClassification | s2s | [Written] | None | None | +| [FaroeseSTS](https://aclanthology.org/2023.nodalida-1.74.pdf) (Snæbjarnarson et al., 2023) | ['fao'] | STS | s2s | [News, Web, Written] | None | None | | [FarsTail](https://link.springer.com/article/10.1007/s00500-023-08959-3) (Amirkhani et al., 2023) | ['fas'] | PairClassification | s2s | [Academic, Written] | None | None | | [FarsiParaphraseDetection](https://huggingface.co/datasets/alighasemi/farsi_paraphrase_detection) | ['fas'] | PairClassification | s2s | | None | None | | [Farsick](https://github.com/ZahraGhasemi-AI/FarSick) | ['fas'] | STS | s2s | | None | None | | [Fashion200kI2TRetrieval](https://openaccess.thecvf.com/content_iccv_2017/html/Han_Automatic_Spatially-Aware_Fashion_ICCV_2017_paper.html) (Han et al., 2017) | ['eng'] | Any2AnyRetrieval | i2t | [Encyclopaedic] | {'test': 66596} | {'test': {'number_of_characters': 2054137, 'num_samples': 66596, 'num_queries': 4889, 'num_documents': 61707, 'min_document_length': 11, 'average_document_length': 33.29, 'max_document_length': 96, 'unique_documents': 61707, 'num_document_images': 0, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 
0, 'num_query_images': 4889, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3119}} | | [Fashion200kT2IRetrieval](https://openaccess.thecvf.com/content_iccv_2017/html/Han_Automatic_Spatially-Aware_Fashion_ICCV_2017_paper.html) (Han et al., 2017) | ['eng'] | Any2AnyRetrieval | t2i | [Encyclopaedic] | {'test': 203543} | {'test': {'number_of_characters': 57390, 'num_samples': 203543, 'num_queries': 1719, 'num_documents': 201824, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 201824, 'min_query_length': 14, 'average_query_length': 33.39, 'max_query_length': 83, 'unique_queries': 1719, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 2.82, 'max_relevant_docs_per_query': 35, 'unique_relevant_docs': 4847}} | | [FashionIQIT2IRetrieval](https://openaccess.thecvf.com/content/CVPR2021/html/Wu_Fashion_IQ_A_New_Dataset_Towards_Retrieving_Images_by_Natural_CVPR_2021_paper.html) (Wu et al., 2021) | ['eng'] | Any2AnyRetrieval | it2i | [Encyclopaedic] | {'test': 80384} | {'test': {'number_of_characters': 361250, 'num_samples': 80384, 'num_queries': 6003, 'num_documents': 74381, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 74381, 'min_query_length': 18, 'average_query_length': 60.18, 'max_query_length': 138, 'unique_queries': 5973, 'num_query_images': 6003, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 4, 'unique_relevant_docs': 6003}} | -| [FeedbackQARetrieval](https://arxiv.org/abs/2204.03025) | ['eng'] | Retrieval | s2p | [Government, Medical, Web, Written] | None | None | +| [FeedbackQARetrieval](https://arxiv.org/abs/2204.03025) (Li et al., 2022) | ['eng'] | Retrieval | s2p | [Government, Medical, Web, Written] | None | None | | 
[FiQA-PL](https://sites.google.com/view/fiqa/) (Nandan Thakur, 2021) | ['pol'] | Retrieval | s2p | [Financial, Written] | None | None | | [FiQA2018](https://sites.google.com/view/fiqa/) (Nandan Thakur, 2021) | ['eng'] | Retrieval | s2p | [Financial, Written] | None | None | | [FiQA2018-Fa](https://huggingface.co/datasets/MCINext/fiqa-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [FiQA2018-NL](https://huggingface.co/datasets/clips/beir-nl-fiqa) (Nikolay Banar, 2024) | ['nld'] | Retrieval | s2p | [Non-fiction, Written] | None | None | | [FilipinoHateSpeechClassification](https://pcj.csp.org.ph/index.php/pcj/issue/download/29/PCJ%20V14%20N1%20pp1-14%202019) (Neil Vicente Cabasag et al., 2019) | ['fil'] | Classification | s2s | [Social, Written] | None | None | | [FilipinoShopeeReviewsClassification](https://uijrt.com/articles/v4/i8/UIJRTV4I80009.pdf) | ['fil'] | Classification | s2s | [Social, Written] | None | None | -| [FinParaSTS](https://huggingface.co/datasets/TurkuNLP/turku_paraphrase_corpus) | ['fin'] | STS | s2s | [News, Subtitles, Written] | None | None | -| [FinToxicityClassification](https://aclanthology.org/2023.nodalida-1.68) | ['fin'] | Classification | s2s | [News, Written] | None | None | +| [FinParaSTS](https://huggingface.co/datasets/TurkuNLP/turku_paraphrase_corpus) (Kanerva et al., 2021) | ['fin'] | STS | s2s | [News, Subtitles, Written] | None | None | +| [FinToxicityClassification](https://aclanthology.org/2023.nodalida-1.68) (Eskelinen et al., 2023) | ['fin'] | Classification | s2s | [News, Written] | None | None | | [FinancialPhrasebankClassification](https://arxiv.org/abs/1307.5336) (P. 
Malo, 2014) | ['eng'] | Classification | s2s | [Financial, News, Written] | None | None | | [Flickr30kI2TRetrieval](https://www.semanticscholar.org/paper/From-image-descriptions-to-visual-denotations%3A-New-Young-Lai/44040913380206991b1991daf1192942e038fe31) (Peter Young, 2014) | ['eng'] | Any2AnyRetrieval | i2t | [Web, Written] | {'test': 6000} | {'test': {'number_of_characters': 319250, 'num_samples': 6000, 'num_queries': 1000, 'num_documents': 5000, 'min_document_length': 11, 'average_document_length': 63.85, 'max_document_length': 375, 'unique_documents': 4999, 'num_document_images': 0, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 1000, 'min_relevant_docs_per_query': 5, 'average_relevant_docs_per_query': 5.0, 'max_relevant_docs_per_query': 5, 'unique_relevant_docs': 5000}} | | [Flickr30kT2IRetrieval](https://www.semanticscholar.org/paper/From-image-descriptions-to-visual-denotations%3A-New-Young-Lai/44040913380206991b1991daf1192942e038fe31) (Peter Young, 2014) | ['eng'] | Any2AnyRetrieval | t2i | [Web, Written] | {'test': 6000} | {'test': {'number_of_characters': 319250, 'num_samples': 6000, 'num_queries': 5000, 'num_documents': 1000, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 1000, 'min_query_length': 11, 'average_query_length': 63.85, 'max_query_length': 375, 'unique_queries': 4999, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}} | @@ -311,73 +321,77 @@ The following tables give you an overview of the tasks in MTEB. 
| [GPUSpeedTask](https://github.com/KennethEnevoldsen/scandinavian-embedding-benchmark/blob/c8376f967d1294419be1d3eb41217d04cd3a65d3/src/seb/registered_tasks/speed.py#L83-L96) | ['eng'] | Speed | s2s | [Fiction, Written] | None | None | | [GTSRB](https://benchmark.ini.rub.de/) (Stallkamp et al., 2011) | ['eng'] | ImageClassification | i2i | [Scene] | {'test': 12630} | {'test': {'num_samples': 12630, 'unique_num_labels': 43, 'min_image_width': 25, 'average_image_width': 50.51, 'max_image_width': 266, 'min_image_height': 25, 'average_image_height': 50.36, 'max_image_height': 232, 'labels': {'16': {'count': 150}, '1': {'count': 720}, '38': {'count': 690}, '33': {'count': 210}, '11': {'count': 420}, '18': {'count': 390}, '12': {'count': 690}, '25': {'count': 480}, '35': {'count': 390}, '7': {'count': 450}, '23': {'count': 150}, '4': {'count': 660}, '9': {'count': 480}, '21': {'count': 90}, '20': {'count': 90}, '27': {'count': 60}, '3': {'count': 450}, '13': {'count': 720}, '10': {'count': 660}, '5': {'count': 630}, '17': {'count': 360}, '34': {'count': 120}, '2': {'count': 750}, '8': {'count': 450}, '30': {'count': 150}, '24': {'count': 90}, '15': {'count': 210}, '26': {'count': 180}, '28': {'count': 150}, '22': {'count': 120}, '14': {'count': 270}, '32': {'count': 60}, '29': {'count': 90}, '6': {'count': 150}, '36': {'count': 120}, '40': {'count': 90}, '41': {'count': 60}, '31': {'count': 270}, '19': {'count': 60}, '0': {'count': 60}, '39': {'count': 90}, '42': {'count': 90}, '37': {'count': 60}}}} | | [GTSRBZeroShot](https://benchmark.ini.rub.de/) (Stallkamp et al., 2011) | ['eng'] | ZeroShotClassification | i2t | [Scene] | {'test': 12630} | {'test': {'num_samples': 12630, 'unique_num_labels': 43, 'min_image_width': 25, 'average_image_width': 50.51, 'max_image_width': 266, 'min_image_height': 25, 'average_image_height': 50.36, 'max_image_height': 232, 'min_label_text_length': 43, 'average_label_text_length': 88.23, 'max_label_text_length': 116, 'labels': {'16': 
{'count': 150}, '1': {'count': 720}, '38': {'count': 690}, '33': {'count': 210}, '11': {'count': 420}, '18': {'count': 390}, '12': {'count': 690}, '25': {'count': 480}, '35': {'count': 390}, '7': {'count': 450}, '23': {'count': 150}, '4': {'count': 660}, '9': {'count': 480}, '21': {'count': 90}, '20': {'count': 90}, '27': {'count': 60}, '3': {'count': 450}, '13': {'count': 720}, '10': {'count': 660}, '5': {'count': 630}, '17': {'count': 360}, '34': {'count': 120}, '2': {'count': 750}, '8': {'count': 450}, '30': {'count': 150}, '24': {'count': 90}, '15': {'count': 210}, '26': {'count': 180}, '28': {'count': 150}, '22': {'count': 120}, '14': {'count': 270}, '32': {'count': 60}, '29': {'count': 90}, '6': {'count': 150}, '36': {'count': 120}, '40': {'count': 90}, '41': {'count': 60}, '31': {'count': 270}, '19': {'count': 60}, '0': {'count': 60}, '39': {'count': 90}, '42': {'count': 90}, '37': {'count': 60}}}} | +| [GTZANGenre](https://huggingface.co/datasets/silky1708/GTZAN-Genre) (Tzanetakis et al., 2002) | ['eng'] | AudioClassification | a2t | [Music] | None | None | | [GeoreviewClassification](https://github.com/yandex/geo-reviews-dataset-2023) | ['rus'] | Classification | p2p | [Reviews, Written] | None | None | | [GeoreviewClusteringP2P](https://github.com/yandex/geo-reviews-dataset-2023) | ['rus'] | Clustering | p2p | [Reviews, Written] | None | None | | [GeorgianFAQRetrieval](https://huggingface.co/datasets/jupyterjazz/georgian-faq) | ['kat'] | Retrieval | s2p | [Web, Written] | None | None | -| [GerDaLIR](https://github.com/lavis-nlp/GerDaLIR) | ['deu'] | Retrieval | s2p | | None | None | -| [GerDaLIRSmall](https://github.com/lavis-nlp/GerDaLIR) | ['deu'] | Retrieval | p2p | [Legal, Written] | None | None | -| [GermanDPR](https://huggingface.co/datasets/deepset/germandpr) (Timo Möller, 2021) | ['deu'] | Retrieval | s2p | | None | None | -| [GermanGovServiceRetrieval](https://huggingface.co/datasets/it-at-m/LHM-Dienstleistungen-QA) | ['deu'] | Retrieval | s2p | 
[Government, Written] | None | None | -| [GermanPoliticiansTwitterSentimentClassification](https://aclanthology.org/2022.konvens-1.9) | ['deu'] | Classification | s2s | [Government, Social, Written] | None | None | -| [GermanQuAD-Retrieval](https://www.kaggle.com/datasets/GermanQuAD) (Timo Möller, 2021) | ['deu'] | Retrieval | s2p | | None | None | +| [GerDaLIR](https://github.com/lavis-nlp/GerDaLIR) (Wrzalik et al., 2021) | ['deu'] | Retrieval | s2p | [Legal] | None | None | +| [GerDaLIRSmall](https://github.com/lavis-nlp/GerDaLIR) (Wrzalik et al., 2021) | ['deu'] | Retrieval | p2p | [Legal, Written] | None | None | +| [GermanDPR](https://huggingface.co/datasets/deepset/germandpr) (Timo Möller, 2021) | ['deu'] | Retrieval | s2p | [Non-fiction, Web, Written] | None | None | +| [GermanGovServiceRetrieval](https://huggingface.co/datasets/it-at-m/LHM-Dienstleistungen-QA) (Schröder et al., 2022) | ['deu'] | Retrieval | s2p | [Government, Written] | None | None | +| [GermanPoliticiansTwitterSentimentClassification](https://aclanthology.org/2022.konvens-1.9) (Schmidt et al., 2022) | ['deu'] | Classification | s2s | [Government, Social, Written] | None | None | +| [GermanQuAD-Retrieval](https://huggingface.co/datasets/deepset/germanquad) (Timo Möller, 2021) | ['deu'] | Retrieval | s2p | [Non-fiction, Web, Written] | None | None | | [GermanSTSBenchmark](https://github.com/t-systems-on-site-services-gmbh/german-STSbenchmark) (Philip May, 2021) | ['deu'] | STS | s2s | | None | None | | [GreekCivicsQA](https://huggingface.co/datasets/antoinelb7/alloprof) | ['ell'] | Retrieval | s2p | [Academic, Written] | None | None | -| [GreekLegalCodeClassification](https://arxiv.org/abs/2109.15298) | ['ell'] | Classification | s2s | [Legal, Written] | None | None | +| [GreekLegalCodeClassification](https://arxiv.org/abs/2109.15298) (Papaloukas et al., 2021) | ['ell'] | Classification | s2s | [Legal, Written] | None | None | +| 
[GreenNodeTableMarkdownRetrieval](https://huggingface.co/GreenNode) | ['vie'] | Retrieval | s2p | [Encyclopaedic, Financial, Non-fiction] | None | None | | [GujaratiNewsClassification](https://github.com/goru001/nlp-for-gujarati) | ['guj'] | Classification | s2s | [News, Written] | None | None | +| [GunshotTriangulation](https://huggingface.co/datasets/anime-sh/GunshotTriangulationHEAR) (Simone Raponi, 2021) | ['eng'] | AudioClassification | a2t | | None | None | | [HALClusteringS2S.v2](https://huggingface.co/datasets/lyon-nlp/clustering-hal-s2s) (Mathieu Ciancone, 2024) | ['fra'] | Clustering | s2s | [Academic, Written] | None | None | | [HagridRetrieval](https://github.com/project-miracl/hagrid) (Ehsan Kamalloo, 2023) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [HamshahriClustring](https://github.com/mallahyari/Farsi-datasets) | ['fas'] | Clustering | p2p | [News] | None | None | -| [HateSpeechPortugueseClassification](https://aclanthology.org/W19-3510) | ['por'] | Classification | s2s | [Social, Written] | None | None | +| [HateSpeechPortugueseClassification](https://aclanthology.org/W19-3510) (Fortuna et al., 2019) | ['por'] | Classification | s2s | [Social, Written] | None | None | | [HatefulMemesI2TRetrieval](https://arxiv.org/pdf/2005.04790) (Kiela et al., 2020) | ['eng'] | Any2AnyRetrieval | i2t | [Encyclopaedic] | {'test': 11000} | {'test': {'number_of_characters': 610257, 'num_samples': 11000, 'num_queries': 1000, 'num_documents': 10000, 'min_document_length': 3, 'average_document_length': 61.03, 'max_document_length': 433, 'unique_documents': 8045, 'num_document_images': 0, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}} | | [HatefulMemesT2IRetrieval](https://arxiv.org/pdf/2005.04790) (Kiela et al., 2020) | ['eng'] | 
Any2AnyRetrieval | t2i | [Encyclopaedic] | {'test': 11000} | {'test': {'number_of_characters': 55468, 'num_samples': 11000, 'num_queries': 1000, 'num_documents': 10000, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 10000, 'min_query_length': 3, 'average_query_length': 55.47, 'max_query_length': 382, 'unique_queries': 829, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}} | -| [HeadlineClassification](https://aclanthology.org/2020.ngt-1.6/) | ['rus'] | Classification | s2s | [News, Written] | None | None | -| [HebrewSentimentAnalysis](https://huggingface.co/datasets/hebrew_sentiment) | ['heb'] | Classification | s2s | [Reviews, Written] | None | None | +| [HeadlineClassification](https://aclanthology.org/2020.ngt-1.6/) (Gudkov et al., 2020) | ['rus'] | Classification | s2s | [News, Written] | None | None | +| [HebrewSentimentAnalysis](https://huggingface.co/datasets/hebrew_sentiment) (Amram et al., 2018) | ['heb'] | Classification | s2s | [Reviews, Written] | None | None | | [HellaSwag](https://rowanzellers.com/hellaswag/) (Xiao et al., 2024) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | | [HinDialectClassification](https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-4839) (Bafna et al., 2022) | ['anp', 'awa', 'ben', 'bgc', 'bhb', 'bhd', 'bho', 'bjj', 'bns', 'bra', 'gbm', 'guj', 'hne', 'kfg', 'kfy', 'mag', 'mar', 'mup', 'noe', 'pan', 'raj'] | Classification | s2s | [Social, Spoken, Written] | None | None | -| [HindiDiscourseClassification](https://aclanthology.org/2020.lrec-1.149/) | ['hin'] | Classification | s2s | [Fiction, Social, Written] | None | None | +| [HindiDiscourseClassification](https://aclanthology.org/2020.lrec-1.149/) (Dhanwal et al., 2020) | ['hin'] | Classification | s2s | [Fiction, Social, Written] | None | None | | 
[HotelReviewSentimentClassification](https://link.springer.com/chapter/10.1007/978-3-319-67056-0_3) (Elnagar et al., 2018) | ['ara'] | Classification | s2s | [Reviews, Written] | None | None | -| [HotpotQA](https://hotpotqa.github.io/) | ['eng'] | Retrieval | s2p | [Web, Written] | None | None | +| [HotpotQA](https://hotpotqa.github.io/) (Yang et al., 2018) | ['eng'] | Retrieval | s2p | [Web, Written] | None | None | | [HotpotQA-Fa](https://huggingface.co/datasets/MCINext/hotpotqa-fa) | ['fas'] | Retrieval | s2p | [Encyclopaedic] | None | None | | [HotpotQA-NL](https://hotpotqa.github.io/) (Nikolay Banar, 2024) | ['nld'] | Retrieval | s2p | [Web, Written] | None | None | | [HotpotQA-PL](https://hotpotqa.github.io/) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | [Web, Written] | None | None | | [HotpotQA-PLHardNegatives](https://hotpotqa.github.io/) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | [Web, Written] | None | None | -| [HotpotQAHardNegatives](https://hotpotqa.github.io/) | ['eng'] | Retrieval | s2p | [Web, Written] | None | None | +| [HotpotQAHardNegatives](https://hotpotqa.github.io/) (Yang et al., 2018) | ['eng'] | Retrieval | s2p | [Web, Written] | None | None | | [HunSum2AbstractiveRetrieval](https://arxiv.org/abs/2404.03555) (Botond Barta, 2024) | ['hun'] | Retrieval | s2p | [News, Written] | None | None | -| [IFlyTek](https://www.cluebenchmarks.com/introduce.html) | ['cmn'] | Classification | s2s | | None | None | +| [IFlyTek](https://www.cluebenchmarks.com/introduce.html) (Xu et al., 2020) | ['cmn'] | Classification | s2s | | None | None | | [IN22ConvBitextMining](https://huggingface.co/datasets/ai4bharat/IN22-Conv) (Jay Gala, 2023) | ['asm', 'ben', 'brx', 'doi', 'eng', 'gom', 'guj', 'hin', 'kan', 'kas', 'mai', 'mal', 'mar', 'mni', 'npi', 'ory', 'pan', 'san', 'sat', 'snd', 'tam', 'tel', 'urd'] | BitextMining | s2s | [Fiction, Social, Spoken, Spoken] | {'test': 760518} | {'test': {'num_samples': 760518, 'number_of_characters': 82637104, 
'unique_pairs': 759283, 'min_sentence1_length': 3, 'average_sentence1_length': 54.33, 'max_sentence1_length': 239, 'unique_sentence1': 34430, 'min_sentence2_length': 3, 'average_sentence2_length': 54.33, 'max_sentence2_length': 239, 'unique_sentence2': 34430, 'hf_subset_descriptive_stats': {'asm_Beng-ben_Beng': {'num_samples': 1503, 'number_of_characters': 155988, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'asm_Beng-brx_Deva': {'num_samples': 1503, 'number_of_characters': 162044, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'asm_Beng-doi_Deva': {'num_samples': 1503, 'number_of_characters': 167032, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'asm_Beng-eng_Latn': {'num_samples': 1503, 'number_of_characters': 160716, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'asm_Beng-gom_Deva': {'num_samples': 1503, 'number_of_characters': 156282, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'asm_Beng-guj_Gujr': {'num_samples': 1503, 
'number_of_characters': 158269, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'asm_Beng-hin_Deva': {'num_samples': 1503, 'number_of_characters': 159964, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'asm_Beng-kan_Knda': {'num_samples': 1503, 'number_of_characters': 165177, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'asm_Beng-kas_Arab': {'num_samples': 1503, 'number_of_characters': 164681, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'asm_Beng-mai_Deva': {'num_samples': 1503, 'number_of_characters': 162408, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'asm_Beng-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 172838, 'unique_pairs': 1498, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'asm_Beng-mar_Deva': {'num_samples': 1503, 
'number_of_characters': 162747, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'asm_Beng-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 157316, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'asm_Beng-npi_Deva': {'num_samples': 1503, 'number_of_characters': 160906, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'asm_Beng-ory_Orya': {'num_samples': 1503, 'number_of_characters': 164223, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'asm_Beng-pan_Guru': {'num_samples': 1503, 'number_of_characters': 160201, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'asm_Beng-san_Deva': {'num_samples': 1503, 'number_of_characters': 158093, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'asm_Beng-sat_Olck': {'num_samples': 1503, 
'number_of_characters': 169379, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'asm_Beng-snd_Deva': {'num_samples': 1503, 'number_of_characters': 162623, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'asm_Beng-tam_Taml': {'num_samples': 1503, 'number_of_characters': 174866, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'asm_Beng-tel_Telu': {'num_samples': 1503, 'number_of_characters': 157690, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'asm_Beng-urd_Arab': {'num_samples': 1503, 'number_of_characters': 161305, 'unique_pairs': 1498, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'ben_Beng-asm_Beng': {'num_samples': 1503, 'number_of_characters': 155988, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'ben_Beng-brx_Deva': {'num_samples': 1503, 
'number_of_characters': 156448, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'ben_Beng-doi_Deva': {'num_samples': 1503, 'number_of_characters': 161436, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'ben_Beng-eng_Latn': {'num_samples': 1503, 'number_of_characters': 155120, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'ben_Beng-gom_Deva': {'num_samples': 1503, 'number_of_characters': 150686, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'ben_Beng-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 152673, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'ben_Beng-hin_Deva': {'num_samples': 1503, 'number_of_characters': 154368, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'ben_Beng-kan_Knda': {'num_samples': 1503, 
'number_of_characters': 159581, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'ben_Beng-kas_Arab': {'num_samples': 1503, 'number_of_characters': 159085, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'ben_Beng-mai_Deva': {'num_samples': 1503, 'number_of_characters': 156812, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'ben_Beng-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 167242, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'ben_Beng-mar_Deva': {'num_samples': 1503, 'number_of_characters': 157151, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'ben_Beng-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 151720, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'ben_Beng-npi_Deva': {'num_samples': 1503, 
'number_of_characters': 155310, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'ben_Beng-ory_Orya': {'num_samples': 1503, 'number_of_characters': 158627, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'ben_Beng-pan_Guru': {'num_samples': 1503, 'number_of_characters': 154605, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'ben_Beng-san_Deva': {'num_samples': 1503, 'number_of_characters': 152497, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'ben_Beng-sat_Olck': {'num_samples': 1503, 'number_of_characters': 163783, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'ben_Beng-snd_Deva': {'num_samples': 1503, 'number_of_characters': 157027, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'ben_Beng-tam_Taml': {'num_samples': 1503, 
'number_of_characters': 169270, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'ben_Beng-tel_Telu': {'num_samples': 1503, 'number_of_characters': 152094, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'ben_Beng-urd_Arab': {'num_samples': 1503, 'number_of_characters': 155709, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'brx_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 162044, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'brx_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 156448, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'brx_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 167492, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'brx_Deva-eng_Latn': {'num_samples': 1503, 
'number_of_characters': 161176, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'brx_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 156742, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'brx_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 158729, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'brx_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 160424, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'brx_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 165637, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'brx_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 165141, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'brx_Deva-mai_Deva': {'num_samples': 1503, 
'number_of_characters': 162868, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'brx_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 173298, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'brx_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 163207, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'brx_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 157776, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'brx_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 161366, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'brx_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 164683, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'brx_Deva-pan_Guru': {'num_samples': 1503, 
'number_of_characters': 160661, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'brx_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 158553, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'brx_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 169839, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'brx_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 163083, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'brx_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 175326, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'brx_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 158150, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'brx_Deva-urd_Arab': {'num_samples': 1503, 
'number_of_characters': 161765, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'doi_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 167032, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'doi_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 161436, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'doi_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 167492, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'doi_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 166164, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'doi_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 161730, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'doi_Deva-guj_Gujr': {'num_samples': 1503, 
'number_of_characters': 163717, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'doi_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 165412, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'doi_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 170625, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'doi_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 170129, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'doi_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 167856, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'doi_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 178286, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'doi_Deva-mar_Deva': {'num_samples': 1503, 
'number_of_characters': 168195, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'doi_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 162764, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'doi_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 166354, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'doi_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 169671, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'doi_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 165649, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'doi_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 163541, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'doi_Deva-sat_Olck': {'num_samples': 1503, 
'number_of_characters': 174827, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'doi_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 168071, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'doi_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 180314, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'doi_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 163138, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'doi_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 166753, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'eng_Latn-asm_Beng': {'num_samples': 1503, 'number_of_characters': 160716, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'eng_Latn-ben_Beng': {'num_samples': 1503, 
'number_of_characters': 155120, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'eng_Latn-brx_Deva': {'num_samples': 1503, 'number_of_characters': 161176, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'eng_Latn-doi_Deva': {'num_samples': 1503, 'number_of_characters': 166164, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'eng_Latn-gom_Deva': {'num_samples': 1503, 'number_of_characters': 155414, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'eng_Latn-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 157401, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'eng_Latn-hin_Deva': {'num_samples': 1503, 'number_of_characters': 159096, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'eng_Latn-kan_Knda': {'num_samples': 1503, 
'number_of_characters': 164309, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'eng_Latn-kas_Arab': {'num_samples': 1503, 'number_of_characters': 163813, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'eng_Latn-mai_Deva': {'num_samples': 1503, 'number_of_characters': 161540, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'eng_Latn-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 171970, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'eng_Latn-mar_Deva': {'num_samples': 1503, 'number_of_characters': 161879, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'eng_Latn-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 156448, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'eng_Latn-npi_Deva': {'num_samples': 1503, 
'number_of_characters': 160038, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'eng_Latn-ory_Orya': {'num_samples': 1503, 'number_of_characters': 163355, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'eng_Latn-pan_Guru': {'num_samples': 1503, 'number_of_characters': 159333, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'eng_Latn-san_Deva': {'num_samples': 1503, 'number_of_characters': 157225, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'eng_Latn-sat_Olck': {'num_samples': 1503, 'number_of_characters': 168511, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'eng_Latn-snd_Deva': {'num_samples': 1503, 'number_of_characters': 161755, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'eng_Latn-tam_Taml': {'num_samples': 1503, 
'number_of_characters': 173998, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'eng_Latn-tel_Telu': {'num_samples': 1503, 'number_of_characters': 156822, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'eng_Latn-urd_Arab': {'num_samples': 1503, 'number_of_characters': 160437, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'gom_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 156282, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'gom_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 150686, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'gom_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 156742, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'gom_Deva-doi_Deva': {'num_samples': 1503, 
'number_of_characters': 161730, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'gom_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 155414, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'gom_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 152967, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'gom_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 154662, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'gom_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 159875, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'gom_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 159379, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'gom_Deva-mai_Deva': {'num_samples': 1503, 
'number_of_characters': 157106, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'gom_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 167536, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'gom_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 157445, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'gom_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 152014, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'gom_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 155604, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'gom_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 158921, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'gom_Deva-pan_Guru': {'num_samples': 1503, 
'number_of_characters': 154899, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'gom_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 152791, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'gom_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 164077, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'gom_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 157321, 'unique_pairs': 1500, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'gom_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 169564, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'gom_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 152388, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'gom_Deva-urd_Arab': {'num_samples': 1503, 
'number_of_characters': 156003, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'guj_Gujr-asm_Beng': {'num_samples': 1503, 'number_of_characters': 158269, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'guj_Gujr-ben_Beng': {'num_samples': 1503, 'number_of_characters': 152673, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'guj_Gujr-brx_Deva': {'num_samples': 1503, 'number_of_characters': 158729, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'guj_Gujr-doi_Deva': {'num_samples': 1503, 'number_of_characters': 163717, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'guj_Gujr-eng_Latn': {'num_samples': 1503, 'number_of_characters': 157401, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'guj_Gujr-gom_Deva': {'num_samples': 1503, 
'number_of_characters': 152967, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'guj_Gujr-hin_Deva': {'num_samples': 1503, 'number_of_characters': 156649, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'guj_Gujr-kan_Knda': {'num_samples': 1503, 'number_of_characters': 161862, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'guj_Gujr-kas_Arab': {'num_samples': 1503, 'number_of_characters': 161366, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'guj_Gujr-mai_Deva': {'num_samples': 1503, 'number_of_characters': 159093, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'guj_Gujr-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 169523, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'guj_Gujr-mar_Deva': {'num_samples': 1503, 
'number_of_characters': 159432, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'guj_Gujr-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 154001, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'guj_Gujr-npi_Deva': {'num_samples': 1503, 'number_of_characters': 157591, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'guj_Gujr-ory_Orya': {'num_samples': 1503, 'number_of_characters': 160908, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'guj_Gujr-pan_Guru': {'num_samples': 1503, 'number_of_characters': 156886, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'guj_Gujr-san_Deva': {'num_samples': 1503, 'number_of_characters': 154778, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'guj_Gujr-sat_Olck': {'num_samples': 1503, 
'number_of_characters': 166064, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'guj_Gujr-snd_Deva': {'num_samples': 1503, 'number_of_characters': 159308, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'guj_Gujr-tam_Taml': {'num_samples': 1503, 'number_of_characters': 171551, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'guj_Gujr-tel_Telu': {'num_samples': 1503, 'number_of_characters': 154375, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'guj_Gujr-urd_Arab': {'num_samples': 1503, 'number_of_characters': 157990, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'hin_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 159964, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'hin_Deva-ben_Beng': {'num_samples': 1503, 
'number_of_characters': 154368, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'hin_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 160424, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'hin_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 165412, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'hin_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 159096, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'hin_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 154662, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'hin_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 156649, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'hin_Deva-kan_Knda': {'num_samples': 1503, 
'number_of_characters': 163557, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'hin_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 163061, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'hin_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 160788, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'hin_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 171218, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'hin_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 161127, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'hin_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 155696, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'hin_Deva-npi_Deva': {'num_samples': 1503, 
'number_of_characters': 159286, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'hin_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 162603, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'hin_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 158581, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'hin_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 156473, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'hin_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 167759, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'hin_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 161003, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'hin_Deva-tam_Taml': {'num_samples': 1503, 
'number_of_characters': 173246, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'hin_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 156070, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'hin_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 159685, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'kan_Knda-asm_Beng': {'num_samples': 1503, 'number_of_characters': 165177, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'kan_Knda-ben_Beng': {'num_samples': 1503, 'number_of_characters': 159581, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'kan_Knda-brx_Deva': {'num_samples': 1503, 'number_of_characters': 165637, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'kan_Knda-doi_Deva': {'num_samples': 1503, 
'number_of_characters': 170625, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'kan_Knda-eng_Latn': {'num_samples': 1503, 'number_of_characters': 164309, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'kan_Knda-gom_Deva': {'num_samples': 1503, 'number_of_characters': 159875, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'kan_Knda-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 161862, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'kan_Knda-hin_Deva': {'num_samples': 1503, 'number_of_characters': 163557, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'kan_Knda-kas_Arab': {'num_samples': 1503, 'number_of_characters': 168274, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'kan_Knda-mai_Deva': {'num_samples': 1503, 
'number_of_characters': 166001, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'kan_Knda-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 176431, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'kan_Knda-mar_Deva': {'num_samples': 1503, 'number_of_characters': 166340, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'kan_Knda-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 160909, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'kan_Knda-npi_Deva': {'num_samples': 1503, 'number_of_characters': 164499, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'kan_Knda-ory_Orya': {'num_samples': 1503, 'number_of_characters': 167816, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'kan_Knda-pan_Guru': {'num_samples': 1503, 
'number_of_characters': 163794, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'kan_Knda-san_Deva': {'num_samples': 1503, 'number_of_characters': 161686, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'kan_Knda-sat_Olck': {'num_samples': 1503, 'number_of_characters': 172972, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'kan_Knda-snd_Deva': {'num_samples': 1503, 'number_of_characters': 166216, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'kan_Knda-tam_Taml': {'num_samples': 1503, 'number_of_characters': 178459, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'kan_Knda-tel_Telu': {'num_samples': 1503, 'number_of_characters': 161283, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'kan_Knda-urd_Arab': {'num_samples': 1503, 
'number_of_characters': 164898, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'kas_Arab-asm_Beng': {'num_samples': 1503, 'number_of_characters': 164681, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'kas_Arab-ben_Beng': {'num_samples': 1503, 'number_of_characters': 159085, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'kas_Arab-brx_Deva': {'num_samples': 1503, 'number_of_characters': 165141, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'kas_Arab-doi_Deva': {'num_samples': 1503, 'number_of_characters': 170129, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'kas_Arab-eng_Latn': {'num_samples': 1503, 'number_of_characters': 163813, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'kas_Arab-gom_Deva': {'num_samples': 1503, 
'number_of_characters': 159379, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'kas_Arab-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 161366, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'kas_Arab-hin_Deva': {'num_samples': 1503, 'number_of_characters': 163061, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'kas_Arab-kan_Knda': {'num_samples': 1503, 'number_of_characters': 168274, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'kas_Arab-mai_Deva': {'num_samples': 1503, 'number_of_characters': 165505, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'kas_Arab-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 175935, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'kas_Arab-mar_Deva': {'num_samples': 1503, 
'number_of_characters': 165844, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'kas_Arab-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 160413, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'kas_Arab-npi_Deva': {'num_samples': 1503, 'number_of_characters': 164003, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'kas_Arab-ory_Orya': {'num_samples': 1503, 'number_of_characters': 167320, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'kas_Arab-pan_Guru': {'num_samples': 1503, 'number_of_characters': 163298, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'kas_Arab-san_Deva': {'num_samples': 1503, 'number_of_characters': 161190, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'kas_Arab-sat_Olck': {'num_samples': 1503, 
'number_of_characters': 172476, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'kas_Arab-snd_Deva': {'num_samples': 1503, 'number_of_characters': 165720, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'kas_Arab-tam_Taml': {'num_samples': 1503, 'number_of_characters': 177963, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'kas_Arab-tel_Telu': {'num_samples': 1503, 'number_of_characters': 160787, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'kas_Arab-urd_Arab': {'num_samples': 1503, 'number_of_characters': 164402, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'mai_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 162408, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'mai_Deva-ben_Beng': {'num_samples': 1503, 
'number_of_characters': 156812, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'mai_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 162868, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'mai_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 167856, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'mai_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 161540, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'mai_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 157106, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'mai_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 159093, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'mai_Deva-hin_Deva': {'num_samples': 1503, 
'number_of_characters': 160788, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'mai_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 166001, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'mai_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 165505, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'mai_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 173662, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'mai_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 163571, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'mai_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 158140, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'mai_Deva-npi_Deva': {'num_samples': 1503, 
'number_of_characters': 161730, 'unique_pairs': 1500, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'mai_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 165047, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'mai_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 161025, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'mai_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 158917, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'mai_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 170203, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'mai_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 163447, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'mai_Deva-tam_Taml': {'num_samples': 1503, 
'number_of_characters': 175690, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'mai_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 158514, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'mai_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 162129, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'mal_Mlym-asm_Beng': {'num_samples': 1503, 'number_of_characters': 172838, 'unique_pairs': 1498, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'mal_Mlym-ben_Beng': {'num_samples': 1503, 'number_of_characters': 167242, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'mal_Mlym-brx_Deva': {'num_samples': 1503, 'number_of_characters': 173298, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'mal_Mlym-doi_Deva': {'num_samples': 1503, 
'number_of_characters': 178286, 'unique_pairs': 1500, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'mal_Mlym-eng_Latn': {'num_samples': 1503, 'number_of_characters': 171970, 'unique_pairs': 1499, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'mal_Mlym-gom_Deva': {'num_samples': 1503, 'number_of_characters': 167536, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'mal_Mlym-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 169523, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'mal_Mlym-hin_Deva': {'num_samples': 1503, 'number_of_characters': 171218, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'mal_Mlym-kan_Knda': {'num_samples': 1503, 'number_of_characters': 176431, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'mal_Mlym-kas_Arab': {'num_samples': 1503, 
'number_of_characters': 175935, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'mal_Mlym-mai_Deva': {'num_samples': 1503, 'number_of_characters': 173662, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'mal_Mlym-mar_Deva': {'num_samples': 1503, 'number_of_characters': 174001, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'mal_Mlym-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 168570, 'unique_pairs': 1500, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'mal_Mlym-npi_Deva': {'num_samples': 1503, 'number_of_characters': 172160, 'unique_pairs': 1500, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'mal_Mlym-ory_Orya': {'num_samples': 1503, 'number_of_characters': 175477, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'mal_Mlym-pan_Guru': {'num_samples': 1503, 
'number_of_characters': 171455, 'unique_pairs': 1498, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'mal_Mlym-san_Deva': {'num_samples': 1503, 'number_of_characters': 169347, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'mal_Mlym-sat_Olck': {'num_samples': 1503, 'number_of_characters': 180633, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'mal_Mlym-snd_Deva': {'num_samples': 1503, 'number_of_characters': 173877, 'unique_pairs': 1499, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'mal_Mlym-tam_Taml': {'num_samples': 1503, 'number_of_characters': 186120, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'mal_Mlym-tel_Telu': {'num_samples': 1503, 'number_of_characters': 168944, 'unique_pairs': 1500, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'mal_Mlym-urd_Arab': {'num_samples': 1503, 
'number_of_characters': 172559, 'unique_pairs': 1499, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'mar_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 162747, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'mar_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 157151, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'mar_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 163207, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'mar_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 168195, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'mar_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 161879, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'mar_Deva-gom_Deva': {'num_samples': 1503, 
'number_of_characters': 157445, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'mar_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 159432, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'mar_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 161127, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'mar_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 166340, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'mar_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 165844, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'mar_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 163571, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'mar_Deva-mal_Mlym': {'num_samples': 1503, 
'number_of_characters': 174001, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'mar_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 158479, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'mar_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 162069, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'mar_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 165386, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'mar_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 161364, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'mar_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 159256, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'mar_Deva-sat_Olck': {'num_samples': 1503, 
'number_of_characters': 170542, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'mar_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 163786, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'mar_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 176029, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'mar_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 158853, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'mar_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 162468, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'mni_Mtei-asm_Beng': {'num_samples': 1503, 'number_of_characters': 157316, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'mni_Mtei-ben_Beng': {'num_samples': 1503, 
'number_of_characters': 151720, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'mni_Mtei-brx_Deva': {'num_samples': 1503, 'number_of_characters': 157776, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'mni_Mtei-doi_Deva': {'num_samples': 1503, 'number_of_characters': 162764, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'mni_Mtei-eng_Latn': {'num_samples': 1503, 'number_of_characters': 156448, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'mni_Mtei-gom_Deva': {'num_samples': 1503, 'number_of_characters': 152014, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'mni_Mtei-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 154001, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'mni_Mtei-hin_Deva': {'num_samples': 1503, 
'number_of_characters': 155696, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'mni_Mtei-kan_Knda': {'num_samples': 1503, 'number_of_characters': 160909, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'mni_Mtei-kas_Arab': {'num_samples': 1503, 'number_of_characters': 160413, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'mni_Mtei-mai_Deva': {'num_samples': 1503, 'number_of_characters': 158140, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'mni_Mtei-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 168570, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'mni_Mtei-mar_Deva': {'num_samples': 1503, 'number_of_characters': 158479, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'mni_Mtei-npi_Deva': {'num_samples': 1503, 
'number_of_characters': 156638, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'mni_Mtei-ory_Orya': {'num_samples': 1503, 'number_of_characters': 159955, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'mni_Mtei-pan_Guru': {'num_samples': 1503, 'number_of_characters': 155933, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'mni_Mtei-san_Deva': {'num_samples': 1503, 'number_of_characters': 153825, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'mni_Mtei-sat_Olck': {'num_samples': 1503, 'number_of_characters': 165111, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'mni_Mtei-snd_Deva': {'num_samples': 1503, 'number_of_characters': 158355, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'mni_Mtei-tam_Taml': {'num_samples': 1503, 
'number_of_characters': 170598, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'mni_Mtei-tel_Telu': {'num_samples': 1503, 'number_of_characters': 153422, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'mni_Mtei-urd_Arab': {'num_samples': 1503, 'number_of_characters': 157037, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'npi_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 160906, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'npi_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 155310, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'npi_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 161366, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'npi_Deva-doi_Deva': {'num_samples': 1503, 
'number_of_characters': 166354, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'npi_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 160038, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'npi_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 155604, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'npi_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 157591, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'npi_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 159286, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'npi_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 164499, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'npi_Deva-kas_Arab': {'num_samples': 1503, 
'number_of_characters': 164003, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'npi_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 161730, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'npi_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 172160, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'npi_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 162069, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'npi_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 156638, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'npi_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 163545, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'npi_Deva-pan_Guru': {'num_samples': 1503, 
'number_of_characters': 159523, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'npi_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 157415, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'npi_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 168701, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'npi_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 161945, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'npi_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 174188, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'npi_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 157012, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'npi_Deva-urd_Arab': {'num_samples': 1503, 
'number_of_characters': 160627, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'ory_Orya-asm_Beng': {'num_samples': 1503, 'number_of_characters': 164223, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'ory_Orya-ben_Beng': {'num_samples': 1503, 'number_of_characters': 158627, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'ory_Orya-brx_Deva': {'num_samples': 1503, 'number_of_characters': 164683, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'ory_Orya-doi_Deva': {'num_samples': 1503, 'number_of_characters': 169671, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'ory_Orya-eng_Latn': {'num_samples': 1503, 'number_of_characters': 163355, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'ory_Orya-gom_Deva': {'num_samples': 1503, 
'number_of_characters': 158921, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'ory_Orya-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 160908, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'ory_Orya-hin_Deva': {'num_samples': 1503, 'number_of_characters': 162603, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'ory_Orya-kan_Knda': {'num_samples': 1503, 'number_of_characters': 167816, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'ory_Orya-kas_Arab': {'num_samples': 1503, 'number_of_characters': 167320, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'ory_Orya-mai_Deva': {'num_samples': 1503, 'number_of_characters': 165047, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'ory_Orya-mal_Mlym': {'num_samples': 1503, 
'number_of_characters': 175477, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'ory_Orya-mar_Deva': {'num_samples': 1503, 'number_of_characters': 165386, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'ory_Orya-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 159955, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'ory_Orya-npi_Deva': {'num_samples': 1503, 'number_of_characters': 163545, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'ory_Orya-pan_Guru': {'num_samples': 1503, 'number_of_characters': 162840, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'ory_Orya-san_Deva': {'num_samples': 1503, 'number_of_characters': 160732, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'ory_Orya-sat_Olck': {'num_samples': 1503, 
'number_of_characters': 172018, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'ory_Orya-snd_Deva': {'num_samples': 1503, 'number_of_characters': 165262, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'ory_Orya-tam_Taml': {'num_samples': 1503, 'number_of_characters': 177505, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'ory_Orya-tel_Telu': {'num_samples': 1503, 'number_of_characters': 160329, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'ory_Orya-urd_Arab': {'num_samples': 1503, 'number_of_characters': 163944, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'pan_Guru-asm_Beng': {'num_samples': 1503, 'number_of_characters': 160201, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'pan_Guru-ben_Beng': {'num_samples': 1503, 
'number_of_characters': 154605, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'pan_Guru-brx_Deva': {'num_samples': 1503, 'number_of_characters': 160661, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'pan_Guru-doi_Deva': {'num_samples': 1503, 'number_of_characters': 165649, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'pan_Guru-eng_Latn': {'num_samples': 1503, 'number_of_characters': 159333, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'pan_Guru-gom_Deva': {'num_samples': 1503, 'number_of_characters': 154899, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'pan_Guru-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 156886, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'pan_Guru-hin_Deva': {'num_samples': 1503, 
'number_of_characters': 158581, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'pan_Guru-kan_Knda': {'num_samples': 1503, 'number_of_characters': 163794, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'pan_Guru-kas_Arab': {'num_samples': 1503, 'number_of_characters': 163298, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'pan_Guru-mai_Deva': {'num_samples': 1503, 'number_of_characters': 161025, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'pan_Guru-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 171455, 'unique_pairs': 1498, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'pan_Guru-mar_Deva': {'num_samples': 1503, 'number_of_characters': 161364, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'pan_Guru-mni_Mtei': {'num_samples': 1503, 
'number_of_characters': 155933, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'pan_Guru-npi_Deva': {'num_samples': 1503, 'number_of_characters': 159523, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'pan_Guru-ory_Orya': {'num_samples': 1503, 'number_of_characters': 162840, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'pan_Guru-san_Deva': {'num_samples': 1503, 'number_of_characters': 156710, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'pan_Guru-sat_Olck': {'num_samples': 1503, 'number_of_characters': 167996, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'pan_Guru-snd_Deva': {'num_samples': 1503, 'number_of_characters': 161240, 'unique_pairs': 1498, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'pan_Guru-tam_Taml': {'num_samples': 1503, 
'number_of_characters': 173483, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'pan_Guru-tel_Telu': {'num_samples': 1503, 'number_of_characters': 156307, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'pan_Guru-urd_Arab': {'num_samples': 1503, 'number_of_characters': 159922, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'san_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 158093, 'unique_pairs': 1501, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'san_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 152497, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'san_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 158553, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'san_Deva-doi_Deva': {'num_samples': 1503, 
'number_of_characters': 163541, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'san_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 157225, 'unique_pairs': 1501, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'san_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 152791, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'san_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 154778, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'san_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 156473, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'san_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 161686, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'san_Deva-kas_Arab': {'num_samples': 1503, 
'number_of_characters': 161190, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'san_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 158917, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'san_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 169347, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'san_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 159256, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'san_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 153825, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'san_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 157415, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'san_Deva-ory_Orya': {'num_samples': 1503, 
'number_of_characters': 160732, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'san_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 156710, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'san_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 165888, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'san_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 159132, 'unique_pairs': 1501, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'san_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 171375, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'san_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 154199, 'unique_pairs': 1501, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'san_Deva-urd_Arab': {'num_samples': 1503, 
'number_of_characters': 157814, 'unique_pairs': 1501, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'sat_Olck-asm_Beng': {'num_samples': 1503, 'number_of_characters': 169379, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'sat_Olck-ben_Beng': {'num_samples': 1503, 'number_of_characters': 163783, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'sat_Olck-brx_Deva': {'num_samples': 1503, 'number_of_characters': 169839, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'sat_Olck-doi_Deva': {'num_samples': 1503, 'number_of_characters': 174827, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'sat_Olck-eng_Latn': {'num_samples': 1503, 'number_of_characters': 168511, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'sat_Olck-gom_Deva': {'num_samples': 1503, 
'number_of_characters': 164077, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'sat_Olck-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 166064, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'sat_Olck-hin_Deva': {'num_samples': 1503, 'number_of_characters': 167759, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'sat_Olck-kan_Knda': {'num_samples': 1503, 'number_of_characters': 172972, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'sat_Olck-kas_Arab': {'num_samples': 1503, 'number_of_characters': 172476, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'sat_Olck-mai_Deva': {'num_samples': 1503, 'number_of_characters': 170203, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'sat_Olck-mal_Mlym': {'num_samples': 1503, 
'number_of_characters': 180633, 'unique_pairs': 1501, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'sat_Olck-mar_Deva': {'num_samples': 1503, 'number_of_characters': 170542, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'sat_Olck-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 165111, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'sat_Olck-npi_Deva': {'num_samples': 1503, 'number_of_characters': 168701, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'sat_Olck-ory_Orya': {'num_samples': 1503, 'number_of_characters': 172018, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'sat_Olck-pan_Guru': {'num_samples': 1503, 'number_of_characters': 167996, 'unique_pairs': 1501, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'sat_Olck-san_Deva': {'num_samples': 1503, 
'number_of_characters': 165888, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'sat_Olck-snd_Deva': {'num_samples': 1503, 'number_of_characters': 170418, 'unique_pairs': 1501, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'sat_Olck-tam_Taml': {'num_samples': 1503, 'number_of_characters': 182661, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'sat_Olck-tel_Telu': {'num_samples': 1503, 'number_of_characters': 165485, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'sat_Olck-urd_Arab': {'num_samples': 1503, 'number_of_characters': 169100, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'snd_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 162623, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'snd_Deva-ben_Beng': {'num_samples': 1503, 
'number_of_characters': 157027, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'snd_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 163083, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'snd_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 168071, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'snd_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 161755, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'snd_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 157321, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'snd_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 159308, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'snd_Deva-hin_Deva': {'num_samples': 1503, 
'number_of_characters': 161003, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'snd_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 166216, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'snd_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 165720, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'snd_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 163447, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'snd_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 173877, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'snd_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 163786, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'snd_Deva-mni_Mtei': {'num_samples': 1503, 
'number_of_characters': 158355, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'snd_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 161945, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'snd_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 165262, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'snd_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 161240, 'unique_pairs': 1498, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'snd_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 159132, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'snd_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 170418, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'snd_Deva-tam_Taml': {'num_samples': 1503, 
'number_of_characters': 175905, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'snd_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 158729, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'snd_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 162344, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'tam_Taml-asm_Beng': {'num_samples': 1503, 'number_of_characters': 174866, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'tam_Taml-ben_Beng': {'num_samples': 1503, 'number_of_characters': 169270, 'unique_pairs': 1501, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'tam_Taml-brx_Deva': {'num_samples': 1503, 'number_of_characters': 175326, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'tam_Taml-doi_Deva': {'num_samples': 1503, 
'number_of_characters': 180314, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'tam_Taml-eng_Latn': {'num_samples': 1503, 'number_of_characters': 173998, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'tam_Taml-gom_Deva': {'num_samples': 1503, 'number_of_characters': 169564, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'tam_Taml-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 171551, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'tam_Taml-hin_Deva': {'num_samples': 1503, 'number_of_characters': 173246, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'tam_Taml-kan_Knda': {'num_samples': 1503, 'number_of_characters': 178459, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'tam_Taml-kas_Arab': {'num_samples': 1503, 
'number_of_characters': 177963, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'tam_Taml-mai_Deva': {'num_samples': 1503, 'number_of_characters': 175690, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'tam_Taml-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 186120, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'tam_Taml-mar_Deva': {'num_samples': 1503, 'number_of_characters': 176029, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'tam_Taml-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 170598, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'tam_Taml-npi_Deva': {'num_samples': 1503, 'number_of_characters': 174188, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'tam_Taml-ory_Orya': {'num_samples': 1503, 
'number_of_characters': 177505, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'tam_Taml-pan_Guru': {'num_samples': 1503, 'number_of_characters': 173483, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'tam_Taml-san_Deva': {'num_samples': 1503, 'number_of_characters': 171375, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'tam_Taml-sat_Olck': {'num_samples': 1503, 'number_of_characters': 182661, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'tam_Taml-snd_Deva': {'num_samples': 1503, 'number_of_characters': 175905, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'tam_Taml-tel_Telu': {'num_samples': 1503, 'number_of_characters': 170972, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'tam_Taml-urd_Arab': {'num_samples': 1503, 
'number_of_characters': 174587, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'tel_Telu-asm_Beng': {'num_samples': 1503, 'number_of_characters': 157690, 'unique_pairs': 1499, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'tel_Telu-ben_Beng': {'num_samples': 1503, 'number_of_characters': 152094, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'tel_Telu-brx_Deva': {'num_samples': 1503, 'number_of_characters': 158150, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'tel_Telu-doi_Deva': {'num_samples': 1503, 'number_of_characters': 163138, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'tel_Telu-eng_Latn': {'num_samples': 1503, 'number_of_characters': 156822, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'tel_Telu-gom_Deva': {'num_samples': 1503, 
'number_of_characters': 152388, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'tel_Telu-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 154375, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'tel_Telu-hin_Deva': {'num_samples': 1503, 'number_of_characters': 156070, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'tel_Telu-kan_Knda': {'num_samples': 1503, 'number_of_characters': 161283, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'tel_Telu-kas_Arab': {'num_samples': 1503, 'number_of_characters': 160787, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'tel_Telu-mai_Deva': {'num_samples': 1503, 'number_of_characters': 158514, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'tel_Telu-mal_Mlym': {'num_samples': 1503, 
'number_of_characters': 168944, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'tel_Telu-mar_Deva': {'num_samples': 1503, 'number_of_characters': 158853, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'tel_Telu-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 153422, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'tel_Telu-npi_Deva': {'num_samples': 1503, 'number_of_characters': 157012, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'tel_Telu-ory_Orya': {'num_samples': 1503, 'number_of_characters': 160329, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'tel_Telu-pan_Guru': {'num_samples': 1503, 'number_of_characters': 156307, 'unique_pairs': 1499, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'tel_Telu-san_Deva': {'num_samples': 1503, 
'number_of_characters': 154199, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'tel_Telu-sat_Olck': {'num_samples': 1503, 'number_of_characters': 165485, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'tel_Telu-snd_Deva': {'num_samples': 1503, 'number_of_characters': 158729, 'unique_pairs': 1499, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'tel_Telu-tam_Taml': {'num_samples': 1503, 'number_of_characters': 170972, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'tel_Telu-urd_Arab': {'num_samples': 1503, 'number_of_characters': 157411, 'unique_pairs': 1499, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'urd_Arab-asm_Beng': {'num_samples': 1503, 'number_of_characters': 161305, 'unique_pairs': 1498, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'urd_Arab-ben_Beng': {'num_samples': 1503, 
'number_of_characters': 155709, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'urd_Arab-brx_Deva': {'num_samples': 1503, 'number_of_characters': 161765, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'urd_Arab-doi_Deva': {'num_samples': 1503, 'number_of_characters': 166753, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'urd_Arab-eng_Latn': {'num_samples': 1503, 'number_of_characters': 160437, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'urd_Arab-gom_Deva': {'num_samples': 1503, 'number_of_characters': 156003, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'urd_Arab-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 157990, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'urd_Arab-hin_Deva': {'num_samples': 1503, 
'number_of_characters': 159685, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'urd_Arab-kan_Knda': {'num_samples': 1503, 'number_of_characters': 164898, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'urd_Arab-kas_Arab': {'num_samples': 1503, 'number_of_characters': 164402, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'urd_Arab-mai_Deva': {'num_samples': 1503, 'number_of_characters': 162129, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'urd_Arab-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 172559, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'urd_Arab-mar_Deva': {'num_samples': 1503, 'number_of_characters': 162468, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'urd_Arab-mni_Mtei': {'num_samples': 1503, 
'number_of_characters': 157037, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'urd_Arab-npi_Deva': {'num_samples': 1503, 'number_of_characters': 160627, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'urd_Arab-ory_Orya': {'num_samples': 1503, 'number_of_characters': 163944, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'urd_Arab-pan_Guru': {'num_samples': 1503, 'number_of_characters': 159922, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'urd_Arab-san_Deva': {'num_samples': 1503, 'number_of_characters': 157814, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'urd_Arab-sat_Olck': {'num_samples': 1503, 'number_of_characters': 169100, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'urd_Arab-snd_Deva': {'num_samples': 1503, 
'number_of_characters': 162344, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'urd_Arab-tam_Taml': {'num_samples': 1503, 'number_of_characters': 174587, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'urd_Arab-tel_Telu': {'num_samples': 1503, 'number_of_characters': 157411, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}}}} | | [IN22GenBitextMining](https://huggingface.co/datasets/ai4bharat/IN22-Gen) (Jay Gala, 2023) | ['asm', 'ben', 'brx', 'doi', 'eng', 'gom', 'guj', 'hin', 'kan', 'kas', 'mai', 'mal', 'mar', 'mni', 'npi', 'ory', 'pan', 'san', 'sat', 'snd', 'tam', 'tel', 'urd'] | BitextMining | s2s | [Government, Legal, News, Non-fiction, Religious, Web, Written] | {'test': 518144} | {'test': {'num_samples': 518144, 'number_of_characters': 162367876, 'unique_pairs': 518101, 'min_sentence1_length': 9, 'average_sentence1_length': 156.68, 'max_sentence1_length': 692, 'unique_sentence1': 23550, 'min_sentence2_length': 9, 'average_sentence2_length': 156.68, 'max_sentence2_length': 692, 'unique_sentence2': 23550, 'hf_subset_descriptive_stats': {'asm_Beng-ben_Beng': {'num_samples': 1024, 'number_of_characters': 310622, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 
'unique_sentence2': 1024}, 'asm_Beng-brx_Deva': {'num_samples': 1024, 'number_of_characters': 323609, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'asm_Beng-doi_Deva': {'num_samples': 1024, 'number_of_characters': 319020, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'asm_Beng-eng_Latn': {'num_samples': 1024, 'number_of_characters': 320098, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'asm_Beng-gom_Deva': {'num_samples': 1024, 'number_of_characters': 312594, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'asm_Beng-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 309440, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'asm_Beng-hin_Deva': {'num_samples': 1024, 'number_of_characters': 320106, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 
'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'asm_Beng-kan_Knda': {'num_samples': 1024, 'number_of_characters': 332064, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'asm_Beng-kas_Arab': {'num_samples': 1024, 'number_of_characters': 322764, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'asm_Beng-mai_Deva': {'num_samples': 1024, 'number_of_characters': 308682, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'asm_Beng-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 343636, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'asm_Beng-mar_Deva': {'num_samples': 1024, 'number_of_characters': 321784, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'asm_Beng-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 313134, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 
149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'asm_Beng-npi_Deva': {'num_samples': 1024, 'number_of_characters': 313419, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'asm_Beng-ory_Orya': {'num_samples': 1024, 'number_of_characters': 334226, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'asm_Beng-pan_Guru': {'num_samples': 1024, 'number_of_characters': 306863, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'asm_Beng-san_Deva': {'num_samples': 1024, 'number_of_characters': 318079, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'asm_Beng-sat_Olck': {'num_samples': 1024, 'number_of_characters': 326732, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'asm_Beng-snd_Deva': {'num_samples': 1024, 'number_of_characters': 320421, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 
'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'asm_Beng-tam_Taml': {'num_samples': 1024, 'number_of_characters': 348346, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'asm_Beng-tel_Telu': {'num_samples': 1024, 'number_of_characters': 319045, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'asm_Beng-urd_Arab': {'num_samples': 1024, 'number_of_characters': 315134, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'ben_Beng-asm_Beng': {'num_samples': 1024, 'number_of_characters': 310622, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'ben_Beng-brx_Deva': {'num_samples': 1024, 'number_of_characters': 313313, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'ben_Beng-doi_Deva': {'num_samples': 1024, 'number_of_characters': 308724, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 
'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'ben_Beng-eng_Latn': {'num_samples': 1024, 'number_of_characters': 309802, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'ben_Beng-gom_Deva': {'num_samples': 1024, 'number_of_characters': 302298, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'ben_Beng-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 299144, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'ben_Beng-hin_Deva': {'num_samples': 1024, 'number_of_characters': 309810, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'ben_Beng-kan_Knda': {'num_samples': 1024, 'number_of_characters': 321768, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'ben_Beng-kas_Arab': {'num_samples': 1024, 'number_of_characters': 312468, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 
'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'ben_Beng-mai_Deva': {'num_samples': 1024, 'number_of_characters': 298386, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'ben_Beng-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 333340, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'ben_Beng-mar_Deva': {'num_samples': 1024, 'number_of_characters': 311488, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'ben_Beng-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 302838, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'ben_Beng-npi_Deva': {'num_samples': 1024, 'number_of_characters': 303123, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'ben_Beng-ory_Orya': {'num_samples': 1024, 'number_of_characters': 323930, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 
'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'ben_Beng-pan_Guru': {'num_samples': 1024, 'number_of_characters': 296567, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'ben_Beng-san_Deva': {'num_samples': 1024, 'number_of_characters': 307783, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'ben_Beng-sat_Olck': {'num_samples': 1024, 'number_of_characters': 316436, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'ben_Beng-snd_Deva': {'num_samples': 1024, 'number_of_characters': 310125, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'ben_Beng-tam_Taml': {'num_samples': 1024, 'number_of_characters': 338050, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'ben_Beng-tel_Telu': {'num_samples': 1024, 'number_of_characters': 308749, 'unique_pairs': 1024, 'min_sentence1_length': 9, 
'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'ben_Beng-urd_Arab': {'num_samples': 1024, 'number_of_characters': 304838, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'brx_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 323609, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'brx_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 313313, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'brx_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 321711, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'brx_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 322789, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'brx_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 315285, 'unique_pairs': 1024, 
'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'brx_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 312131, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'brx_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 322797, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'brx_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 334755, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'brx_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 325455, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'brx_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 311373, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'brx_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 346327, 
'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'brx_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 324475, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'brx_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 315825, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'brx_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 316110, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'brx_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 336917, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'brx_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 309554, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'brx_Deva-san_Deva': {'num_samples': 1024, 
'number_of_characters': 320770, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'brx_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 329423, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'brx_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 323112, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'brx_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 351037, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'brx_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 321736, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'brx_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 317825, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'doi_Deva-asm_Beng': 
{'num_samples': 1024, 'number_of_characters': 319020, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'doi_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 308724, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'doi_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 321711, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'doi_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 318200, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'doi_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 310696, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'doi_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 307542, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 
'doi_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 318208, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'doi_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 330166, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'doi_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 320866, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'doi_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 306784, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'doi_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 341738, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'doi_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 319886, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 
'unique_sentence2': 1024}, 'doi_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 311236, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'doi_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 311521, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'doi_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 332328, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'doi_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 304965, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'doi_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 316181, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'doi_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 324834, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 
'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'doi_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 318523, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'doi_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 346448, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'doi_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 317147, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'doi_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 313236, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'eng_Latn-asm_Beng': {'num_samples': 1024, 'number_of_characters': 320098, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'eng_Latn-ben_Beng': {'num_samples': 1024, 'number_of_characters': 309802, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 
'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'eng_Latn-brx_Deva': {'num_samples': 1024, 'number_of_characters': 322789, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'eng_Latn-doi_Deva': {'num_samples': 1024, 'number_of_characters': 318200, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'eng_Latn-gom_Deva': {'num_samples': 1024, 'number_of_characters': 311774, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'eng_Latn-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 308620, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'eng_Latn-hin_Deva': {'num_samples': 1024, 'number_of_characters': 319286, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'eng_Latn-kan_Knda': {'num_samples': 1024, 'number_of_characters': 331244, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 
'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'eng_Latn-kas_Arab': {'num_samples': 1024, 'number_of_characters': 321944, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'eng_Latn-mai_Deva': {'num_samples': 1024, 'number_of_characters': 307862, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'eng_Latn-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 342816, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'eng_Latn-mar_Deva': {'num_samples': 1024, 'number_of_characters': 320964, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'eng_Latn-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 312314, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'eng_Latn-npi_Deva': {'num_samples': 1024, 'number_of_characters': 312599, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 
'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'eng_Latn-ory_Orya': {'num_samples': 1024, 'number_of_characters': 333406, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'eng_Latn-pan_Guru': {'num_samples': 1024, 'number_of_characters': 306043, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'eng_Latn-san_Deva': {'num_samples': 1024, 'number_of_characters': 317259, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'eng_Latn-sat_Olck': {'num_samples': 1024, 'number_of_characters': 325912, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'eng_Latn-snd_Deva': {'num_samples': 1024, 'number_of_characters': 319601, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'eng_Latn-tam_Taml': {'num_samples': 1024, 'number_of_characters': 347526, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 
'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'eng_Latn-tel_Telu': {'num_samples': 1024, 'number_of_characters': 318225, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'eng_Latn-urd_Arab': {'num_samples': 1024, 'number_of_characters': 314314, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'gom_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 312594, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'gom_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 302298, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'gom_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 315285, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'gom_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 310696, 'unique_pairs': 1024, 'min_sentence1_length': 17, 
'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'gom_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 311774, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'gom_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 301116, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'gom_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 311782, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'gom_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 323740, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'gom_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 314440, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'gom_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 300358, 'unique_pairs': 1024, 
'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'gom_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 335312, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'gom_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 313460, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'gom_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 304810, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'gom_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 305095, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'gom_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 325902, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'gom_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 298539, 
'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'gom_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 309755, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'gom_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 318408, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'gom_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 312097, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'gom_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 340022, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'gom_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 310721, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'gom_Deva-urd_Arab': {'num_samples': 1024, 
'number_of_characters': 306810, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'guj_Gujr-asm_Beng': {'num_samples': 1024, 'number_of_characters': 309440, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'guj_Gujr-ben_Beng': {'num_samples': 1024, 'number_of_characters': 299144, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'guj_Gujr-brx_Deva': {'num_samples': 1024, 'number_of_characters': 312131, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'guj_Gujr-doi_Deva': {'num_samples': 1024, 'number_of_characters': 307542, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'guj_Gujr-eng_Latn': {'num_samples': 1024, 'number_of_characters': 308620, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'guj_Gujr-gom_Deva': 
{'num_samples': 1024, 'number_of_characters': 301116, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'guj_Gujr-hin_Deva': {'num_samples': 1024, 'number_of_characters': 308628, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'guj_Gujr-kan_Knda': {'num_samples': 1024, 'number_of_characters': 320586, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'guj_Gujr-kas_Arab': {'num_samples': 1024, 'number_of_characters': 311286, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'guj_Gujr-mai_Deva': {'num_samples': 1024, 'number_of_characters': 297204, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'guj_Gujr-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 332158, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 
'guj_Gujr-mar_Deva': {'num_samples': 1024, 'number_of_characters': 310306, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'guj_Gujr-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 301656, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'guj_Gujr-npi_Deva': {'num_samples': 1024, 'number_of_characters': 301941, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'guj_Gujr-ory_Orya': {'num_samples': 1024, 'number_of_characters': 322748, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'guj_Gujr-pan_Guru': {'num_samples': 1024, 'number_of_characters': 295385, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'guj_Gujr-san_Deva': {'num_samples': 1024, 'number_of_characters': 306601, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 
'unique_sentence2': 1024}, 'guj_Gujr-sat_Olck': {'num_samples': 1024, 'number_of_characters': 315254, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'guj_Gujr-snd_Deva': {'num_samples': 1024, 'number_of_characters': 308943, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'guj_Gujr-tam_Taml': {'num_samples': 1024, 'number_of_characters': 336868, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'guj_Gujr-tel_Telu': {'num_samples': 1024, 'number_of_characters': 307567, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'guj_Gujr-urd_Arab': {'num_samples': 1024, 'number_of_characters': 303656, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'hin_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 320106, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 
'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'hin_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 309810, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'hin_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 322797, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'hin_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 318208, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'hin_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 319286, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'hin_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 311782, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'hin_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 308628, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 
'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'hin_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 331252, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'hin_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 321952, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'hin_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 307870, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'hin_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 342824, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'hin_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 320972, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'hin_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 312322, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 
'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'hin_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 312607, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'hin_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 333414, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'hin_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 306051, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'hin_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 317267, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'hin_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 325920, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'hin_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 319609, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 
'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'hin_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 347534, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'hin_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 318233, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'hin_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 314322, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'kan_Knda-asm_Beng': {'num_samples': 1024, 'number_of_characters': 332064, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'kan_Knda-ben_Beng': {'num_samples': 1024, 'number_of_characters': 321768, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'kan_Knda-brx_Deva': {'num_samples': 1024, 'number_of_characters': 334755, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 
'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'kan_Knda-doi_Deva': {'num_samples': 1024, 'number_of_characters': 330166, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'kan_Knda-eng_Latn': {'num_samples': 1024, 'number_of_characters': 331244, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'kan_Knda-gom_Deva': {'num_samples': 1024, 'number_of_characters': 323740, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'kan_Knda-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 320586, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'kan_Knda-hin_Deva': {'num_samples': 1024, 'number_of_characters': 331252, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'kan_Knda-kas_Arab': {'num_samples': 1024, 'number_of_characters': 333910, 'unique_pairs': 1024, 'min_sentence1_length': 9, 
'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'kan_Knda-mai_Deva': {'num_samples': 1024, 'number_of_characters': 319828, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'kan_Knda-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 354782, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'kan_Knda-mar_Deva': {'num_samples': 1024, 'number_of_characters': 332930, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'kan_Knda-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 324280, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'kan_Knda-npi_Deva': {'num_samples': 1024, 'number_of_characters': 324565, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'kan_Knda-ory_Orya': {'num_samples': 1024, 'number_of_characters': 345372, 'unique_pairs': 1024, 
'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'kan_Knda-pan_Guru': {'num_samples': 1024, 'number_of_characters': 318009, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'kan_Knda-san_Deva': {'num_samples': 1024, 'number_of_characters': 329225, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'kan_Knda-sat_Olck': {'num_samples': 1024, 'number_of_characters': 337878, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'kan_Knda-snd_Deva': {'num_samples': 1024, 'number_of_characters': 331567, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'kan_Knda-tam_Taml': {'num_samples': 1024, 'number_of_characters': 359492, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'kan_Knda-tel_Telu': {'num_samples': 1024, 'number_of_characters': 330191, 
'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'kan_Knda-urd_Arab': {'num_samples': 1024, 'number_of_characters': 326280, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'kas_Arab-asm_Beng': {'num_samples': 1024, 'number_of_characters': 322764, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'kas_Arab-ben_Beng': {'num_samples': 1024, 'number_of_characters': 312468, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'kas_Arab-brx_Deva': {'num_samples': 1024, 'number_of_characters': 325455, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'kas_Arab-doi_Deva': {'num_samples': 1024, 'number_of_characters': 320866, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'kas_Arab-eng_Latn': {'num_samples': 1024, 'number_of_characters': 
321944, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'kas_Arab-gom_Deva': {'num_samples': 1024, 'number_of_characters': 314440, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'kas_Arab-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 311286, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'kas_Arab-hin_Deva': {'num_samples': 1024, 'number_of_characters': 321952, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'kas_Arab-kan_Knda': {'num_samples': 1024, 'number_of_characters': 333910, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'kas_Arab-mai_Deva': {'num_samples': 1024, 'number_of_characters': 310528, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'kas_Arab-mal_Mlym': {'num_samples': 1024, 
'number_of_characters': 345482, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'kas_Arab-mar_Deva': {'num_samples': 1024, 'number_of_characters': 323630, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'kas_Arab-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 314980, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'kas_Arab-npi_Deva': {'num_samples': 1024, 'number_of_characters': 315265, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'kas_Arab-ory_Orya': {'num_samples': 1024, 'number_of_characters': 336072, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'kas_Arab-pan_Guru': {'num_samples': 1024, 'number_of_characters': 308709, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'kas_Arab-san_Deva': 
{'num_samples': 1024, 'number_of_characters': 319925, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'kas_Arab-sat_Olck': {'num_samples': 1024, 'number_of_characters': 328578, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'kas_Arab-snd_Deva': {'num_samples': 1024, 'number_of_characters': 322267, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'kas_Arab-tam_Taml': {'num_samples': 1024, 'number_of_characters': 350192, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'kas_Arab-tel_Telu': {'num_samples': 1024, 'number_of_characters': 320891, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'kas_Arab-urd_Arab': {'num_samples': 1024, 'number_of_characters': 316980, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 
'mai_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 308682, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'mai_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 298386, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'mai_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 311373, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'mai_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 306784, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'mai_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 307862, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'mai_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 300358, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 
'unique_sentence2': 1024}, 'mai_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 297204, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'mai_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 307870, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'mai_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 319828, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'mai_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 310528, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'mai_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 331400, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'mai_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 309548, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 
'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'mai_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 300898, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'mai_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 301183, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'mai_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 321990, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'mai_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 294627, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'mai_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 305843, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'mai_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 314496, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 
'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'mai_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 308185, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'mai_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 336110, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'mai_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 306809, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'mai_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 302898, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'mal_Mlym-asm_Beng': {'num_samples': 1024, 'number_of_characters': 343636, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'mal_Mlym-ben_Beng': {'num_samples': 1024, 'number_of_characters': 333340, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 
'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'mal_Mlym-brx_Deva': {'num_samples': 1024, 'number_of_characters': 346327, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'mal_Mlym-doi_Deva': {'num_samples': 1024, 'number_of_characters': 341738, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'mal_Mlym-eng_Latn': {'num_samples': 1024, 'number_of_characters': 342816, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'mal_Mlym-gom_Deva': {'num_samples': 1024, 'number_of_characters': 335312, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'mal_Mlym-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 332158, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'mal_Mlym-hin_Deva': {'num_samples': 1024, 'number_of_characters': 342824, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 
'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'mal_Mlym-kan_Knda': {'num_samples': 1024, 'number_of_characters': 354782, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'mal_Mlym-kas_Arab': {'num_samples': 1024, 'number_of_characters': 345482, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'mal_Mlym-mai_Deva': {'num_samples': 1024, 'number_of_characters': 331400, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'mal_Mlym-mar_Deva': {'num_samples': 1024, 'number_of_characters': 344502, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'mal_Mlym-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 335852, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'mal_Mlym-npi_Deva': {'num_samples': 1024, 'number_of_characters': 336137, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 
'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'mal_Mlym-ory_Orya': {'num_samples': 1024, 'number_of_characters': 356944, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'mal_Mlym-pan_Guru': {'num_samples': 1024, 'number_of_characters': 329581, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'mal_Mlym-san_Deva': {'num_samples': 1024, 'number_of_characters': 340797, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'mal_Mlym-sat_Olck': {'num_samples': 1024, 'number_of_characters': 349450, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'mal_Mlym-snd_Deva': {'num_samples': 1024, 'number_of_characters': 343139, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'mal_Mlym-tam_Taml': {'num_samples': 1024, 'number_of_characters': 371064, 'unique_pairs': 1024, 'min_sentence1_length': 13, 
'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'mal_Mlym-tel_Telu': {'num_samples': 1024, 'number_of_characters': 341763, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'mal_Mlym-urd_Arab': {'num_samples': 1024, 'number_of_characters': 337852, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'mar_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 321784, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'mar_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 311488, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'mar_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 324475, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'mar_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 319886, 'unique_pairs': 1024, 
'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'mar_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 320964, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'mar_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 313460, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'mar_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 310306, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'mar_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 320972, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'mar_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 332930, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'mar_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 323630, 
'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'mar_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 309548, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'mar_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 344502, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'mar_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 314000, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'mar_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 314285, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'mar_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 335092, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'mar_Deva-pan_Guru': {'num_samples': 1024, 
'number_of_characters': 307729, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'mar_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 318945, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'mar_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 327598, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'mar_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 321287, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'mar_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 349212, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'mar_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 319911, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'mar_Deva-urd_Arab': 
{'num_samples': 1024, 'number_of_characters': 316000, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'mni_Mtei-asm_Beng': {'num_samples': 1024, 'number_of_characters': 313134, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'mni_Mtei-ben_Beng': {'num_samples': 1024, 'number_of_characters': 302838, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'mni_Mtei-brx_Deva': {'num_samples': 1024, 'number_of_characters': 315825, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'mni_Mtei-doi_Deva': {'num_samples': 1024, 'number_of_characters': 311236, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'mni_Mtei-eng_Latn': {'num_samples': 1024, 'number_of_characters': 312314, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 
'mni_Mtei-gom_Deva': {'num_samples': 1024, 'number_of_characters': 304810, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'mni_Mtei-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 301656, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'mni_Mtei-hin_Deva': {'num_samples': 1024, 'number_of_characters': 312322, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'mni_Mtei-kan_Knda': {'num_samples': 1024, 'number_of_characters': 324280, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'mni_Mtei-kas_Arab': {'num_samples': 1024, 'number_of_characters': 314980, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'mni_Mtei-mai_Deva': {'num_samples': 1024, 'number_of_characters': 300898, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 
'unique_sentence2': 1024}, 'mni_Mtei-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 335852, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'mni_Mtei-mar_Deva': {'num_samples': 1024, 'number_of_characters': 314000, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'mni_Mtei-npi_Deva': {'num_samples': 1024, 'number_of_characters': 305635, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'mni_Mtei-ory_Orya': {'num_samples': 1024, 'number_of_characters': 326442, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'mni_Mtei-pan_Guru': {'num_samples': 1024, 'number_of_characters': 299079, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'mni_Mtei-san_Deva': {'num_samples': 1024, 'number_of_characters': 310295, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 
'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'mni_Mtei-sat_Olck': {'num_samples': 1024, 'number_of_characters': 318948, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'mni_Mtei-snd_Deva': {'num_samples': 1024, 'number_of_characters': 312637, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'mni_Mtei-tam_Taml': {'num_samples': 1024, 'number_of_characters': 340562, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'mni_Mtei-tel_Telu': {'num_samples': 1024, 'number_of_characters': 311261, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'mni_Mtei-urd_Arab': {'num_samples': 1024, 'number_of_characters': 307350, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'npi_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 313419, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 
'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'npi_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 303123, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'npi_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 316110, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'npi_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 311521, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'npi_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 312599, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'npi_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 305095, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'npi_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 301941, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 
'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'npi_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 312607, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'npi_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 324565, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'npi_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 315265, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'npi_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 301183, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'npi_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 336137, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'npi_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 314285, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 
'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'npi_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 305635, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'npi_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 326727, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'npi_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 299364, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'npi_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 310580, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'npi_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 319233, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'npi_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 312922, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 
'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'npi_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 340847, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'npi_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 311546, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'npi_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 307635, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'ory_Orya-asm_Beng': {'num_samples': 1024, 'number_of_characters': 334226, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'ory_Orya-ben_Beng': {'num_samples': 1024, 'number_of_characters': 323930, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'ory_Orya-brx_Deva': {'num_samples': 1024, 'number_of_characters': 336917, 'unique_pairs': 1024, 'min_sentence1_length': 10, 
'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'ory_Orya-doi_Deva': {'num_samples': 1024, 'number_of_characters': 332328, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'ory_Orya-eng_Latn': {'num_samples': 1024, 'number_of_characters': 333406, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'ory_Orya-gom_Deva': {'num_samples': 1024, 'number_of_characters': 325902, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'ory_Orya-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 322748, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'ory_Orya-hin_Deva': {'num_samples': 1024, 'number_of_characters': 333414, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'ory_Orya-kan_Knda': {'num_samples': 1024, 'number_of_characters': 345372, 'unique_pairs': 1024, 
'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'ory_Orya-kas_Arab': {'num_samples': 1024, 'number_of_characters': 336072, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'ory_Orya-mai_Deva': {'num_samples': 1024, 'number_of_characters': 321990, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'ory_Orya-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 356944, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'ory_Orya-mar_Deva': {'num_samples': 1024, 'number_of_characters': 335092, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'ory_Orya-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 326442, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'ory_Orya-npi_Deva': {'num_samples': 1024, 'number_of_characters': 326727, 
'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'ory_Orya-pan_Guru': {'num_samples': 1024, 'number_of_characters': 320171, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'ory_Orya-san_Deva': {'num_samples': 1024, 'number_of_characters': 331387, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'ory_Orya-sat_Olck': {'num_samples': 1024, 'number_of_characters': 340040, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'ory_Orya-snd_Deva': {'num_samples': 1024, 'number_of_characters': 333729, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'ory_Orya-tam_Taml': {'num_samples': 1024, 'number_of_characters': 361654, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'ory_Orya-tel_Telu': {'num_samples': 1024, 
'number_of_characters': 332353, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'ory_Orya-urd_Arab': {'num_samples': 1024, 'number_of_characters': 328442, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'pan_Guru-asm_Beng': {'num_samples': 1024, 'number_of_characters': 306863, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'pan_Guru-ben_Beng': {'num_samples': 1024, 'number_of_characters': 296567, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'pan_Guru-brx_Deva': {'num_samples': 1024, 'number_of_characters': 309554, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'pan_Guru-doi_Deva': {'num_samples': 1024, 'number_of_characters': 304965, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'pan_Guru-eng_Latn': 
{'num_samples': 1024, 'number_of_characters': 306043, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'pan_Guru-gom_Deva': {'num_samples': 1024, 'number_of_characters': 298539, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'pan_Guru-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 295385, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'pan_Guru-hin_Deva': {'num_samples': 1024, 'number_of_characters': 306051, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'pan_Guru-kan_Knda': {'num_samples': 1024, 'number_of_characters': 318009, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'pan_Guru-kas_Arab': {'num_samples': 1024, 'number_of_characters': 308709, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 
'pan_Guru-mai_Deva': {'num_samples': 1024, 'number_of_characters': 294627, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'pan_Guru-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 329581, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'pan_Guru-mar_Deva': {'num_samples': 1024, 'number_of_characters': 307729, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'pan_Guru-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 299079, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'pan_Guru-npi_Deva': {'num_samples': 1024, 'number_of_characters': 299364, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'pan_Guru-ory_Orya': {'num_samples': 1024, 'number_of_characters': 320171, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 
'unique_sentence2': 1024}, 'pan_Guru-san_Deva': {'num_samples': 1024, 'number_of_characters': 304024, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'pan_Guru-sat_Olck': {'num_samples': 1024, 'number_of_characters': 312677, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'pan_Guru-snd_Deva': {'num_samples': 1024, 'number_of_characters': 306366, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'pan_Guru-tam_Taml': {'num_samples': 1024, 'number_of_characters': 334291, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'pan_Guru-tel_Telu': {'num_samples': 1024, 'number_of_characters': 304990, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'pan_Guru-urd_Arab': {'num_samples': 1024, 'number_of_characters': 301079, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 
'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'san_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 318079, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'san_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 307783, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'san_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 320770, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'san_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 316181, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'san_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 317259, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'san_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 309755, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 
148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'san_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 306601, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'san_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 317267, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'san_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 329225, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'san_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 319925, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'san_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 305843, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'san_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 340797, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 
'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'san_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 318945, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'san_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 310295, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'san_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 310580, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'san_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 331387, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'san_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 304024, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'san_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 323893, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 
'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'san_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 317582, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'san_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 345507, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'san_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 316206, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'san_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 312295, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'sat_Olck-asm_Beng': {'num_samples': 1024, 'number_of_characters': 326732, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'sat_Olck-ben_Beng': {'num_samples': 1024, 'number_of_characters': 316436, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 
'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'sat_Olck-brx_Deva': {'num_samples': 1024, 'number_of_characters': 329423, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'sat_Olck-doi_Deva': {'num_samples': 1024, 'number_of_characters': 324834, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'sat_Olck-eng_Latn': {'num_samples': 1024, 'number_of_characters': 325912, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'sat_Olck-gom_Deva': {'num_samples': 1024, 'number_of_characters': 318408, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'sat_Olck-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 315254, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'sat_Olck-hin_Deva': {'num_samples': 1024, 'number_of_characters': 325920, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 
'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'sat_Olck-kan_Knda': {'num_samples': 1024, 'number_of_characters': 337878, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'sat_Olck-kas_Arab': {'num_samples': 1024, 'number_of_characters': 328578, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'sat_Olck-mai_Deva': {'num_samples': 1024, 'number_of_characters': 314496, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'sat_Olck-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 349450, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'sat_Olck-mar_Deva': {'num_samples': 1024, 'number_of_characters': 327598, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'sat_Olck-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 318948, 'unique_pairs': 1024, 'min_sentence1_length': 11, 
'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'sat_Olck-npi_Deva': {'num_samples': 1024, 'number_of_characters': 319233, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'sat_Olck-ory_Orya': {'num_samples': 1024, 'number_of_characters': 340040, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'sat_Olck-pan_Guru': {'num_samples': 1024, 'number_of_characters': 312677, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'sat_Olck-san_Deva': {'num_samples': 1024, 'number_of_characters': 323893, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'sat_Olck-snd_Deva': {'num_samples': 1024, 'number_of_characters': 326235, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'sat_Olck-tam_Taml': {'num_samples': 1024, 'number_of_characters': 354160, 'unique_pairs': 1024, 
'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'sat_Olck-tel_Telu': {'num_samples': 1024, 'number_of_characters': 324859, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'sat_Olck-urd_Arab': {'num_samples': 1024, 'number_of_characters': 320948, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'snd_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 320421, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'snd_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 310125, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'snd_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 323112, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'snd_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 318523, 
'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'snd_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 319601, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'snd_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 312097, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'snd_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 308943, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'snd_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 319609, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'snd_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 331567, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'snd_Deva-kas_Arab': {'num_samples': 1024, 
'number_of_characters': 322267, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'snd_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 308185, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'snd_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 343139, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'snd_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 321287, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'snd_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 312637, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'snd_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 312922, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'snd_Deva-ory_Orya': 
{'num_samples': 1024, 'number_of_characters': 333729, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'snd_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 306366, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'snd_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 317582, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'snd_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 326235, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'snd_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 347849, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'snd_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 318548, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 
'snd_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 314637, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'tam_Taml-asm_Beng': {'num_samples': 1024, 'number_of_characters': 348346, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'tam_Taml-ben_Beng': {'num_samples': 1024, 'number_of_characters': 338050, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'tam_Taml-brx_Deva': {'num_samples': 1024, 'number_of_characters': 351037, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'tam_Taml-doi_Deva': {'num_samples': 1024, 'number_of_characters': 346448, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'tam_Taml-eng_Latn': {'num_samples': 1024, 'number_of_characters': 347526, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 
'unique_sentence2': 1024}, 'tam_Taml-gom_Deva': {'num_samples': 1024, 'number_of_characters': 340022, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'tam_Taml-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 336868, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'tam_Taml-hin_Deva': {'num_samples': 1024, 'number_of_characters': 347534, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'tam_Taml-kan_Knda': {'num_samples': 1024, 'number_of_characters': 359492, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'tam_Taml-kas_Arab': {'num_samples': 1024, 'number_of_characters': 350192, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'tam_Taml-mai_Deva': {'num_samples': 1024, 'number_of_characters': 336110, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 
'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'tam_Taml-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 371064, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'tam_Taml-mar_Deva': {'num_samples': 1024, 'number_of_characters': 349212, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'tam_Taml-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 340562, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'tam_Taml-npi_Deva': {'num_samples': 1024, 'number_of_characters': 340847, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'tam_Taml-ory_Orya': {'num_samples': 1024, 'number_of_characters': 361654, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'tam_Taml-pan_Guru': {'num_samples': 1024, 'number_of_characters': 334291, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 19, 
'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'tam_Taml-san_Deva': {'num_samples': 1024, 'number_of_characters': 345507, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'tam_Taml-sat_Olck': {'num_samples': 1024, 'number_of_characters': 354160, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'tam_Taml-snd_Deva': {'num_samples': 1024, 'number_of_characters': 347849, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'tam_Taml-tel_Telu': {'num_samples': 1024, 'number_of_characters': 346473, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'tam_Taml-urd_Arab': {'num_samples': 1024, 'number_of_characters': 342562, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'tel_Telu-asm_Beng': {'num_samples': 1024, 'number_of_characters': 319045, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 
'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'tel_Telu-ben_Beng': {'num_samples': 1024, 'number_of_characters': 308749, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'tel_Telu-brx_Deva': {'num_samples': 1024, 'number_of_characters': 321736, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'tel_Telu-doi_Deva': {'num_samples': 1024, 'number_of_characters': 317147, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'tel_Telu-eng_Latn': {'num_samples': 1024, 'number_of_characters': 318225, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'tel_Telu-gom_Deva': {'num_samples': 1024, 'number_of_characters': 310721, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'tel_Telu-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 307567, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 
'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'tel_Telu-hin_Deva': {'num_samples': 1024, 'number_of_characters': 318233, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'tel_Telu-kan_Knda': {'num_samples': 1024, 'number_of_characters': 330191, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'tel_Telu-kas_Arab': {'num_samples': 1024, 'number_of_characters': 320891, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'tel_Telu-mai_Deva': {'num_samples': 1024, 'number_of_characters': 306809, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'tel_Telu-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 341763, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'tel_Telu-mar_Deva': {'num_samples': 1024, 'number_of_characters': 319911, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 
'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'tel_Telu-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 311261, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'tel_Telu-npi_Deva': {'num_samples': 1024, 'number_of_characters': 311546, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'tel_Telu-ory_Orya': {'num_samples': 1024, 'number_of_characters': 332353, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'tel_Telu-pan_Guru': {'num_samples': 1024, 'number_of_characters': 304990, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'tel_Telu-san_Deva': {'num_samples': 1024, 'number_of_characters': 316206, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'tel_Telu-sat_Olck': {'num_samples': 1024, 'number_of_characters': 324859, 'unique_pairs': 1024, 'min_sentence1_length': 14, 
'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'tel_Telu-snd_Deva': {'num_samples': 1024, 'number_of_characters': 318548, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'tel_Telu-tam_Taml': {'num_samples': 1024, 'number_of_characters': 346473, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'tel_Telu-urd_Arab': {'num_samples': 1024, 'number_of_characters': 313261, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'urd_Arab-asm_Beng': {'num_samples': 1024, 'number_of_characters': 315134, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'urd_Arab-ben_Beng': {'num_samples': 1024, 'number_of_characters': 304838, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'urd_Arab-brx_Deva': {'num_samples': 1024, 'number_of_characters': 317825, 'unique_pairs': 1024, 
'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'urd_Arab-doi_Deva': {'num_samples': 1024, 'number_of_characters': 313236, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'urd_Arab-eng_Latn': {'num_samples': 1024, 'number_of_characters': 314314, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'urd_Arab-gom_Deva': {'num_samples': 1024, 'number_of_characters': 306810, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'urd_Arab-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 303656, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'urd_Arab-hin_Deva': {'num_samples': 1024, 'number_of_characters': 314322, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'urd_Arab-kan_Knda': {'num_samples': 1024, 'number_of_characters': 326280, 
'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'urd_Arab-kas_Arab': {'num_samples': 1024, 'number_of_characters': 316980, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'urd_Arab-mai_Deva': {'num_samples': 1024, 'number_of_characters': 302898, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'urd_Arab-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 337852, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'urd_Arab-mar_Deva': {'num_samples': 1024, 'number_of_characters': 316000, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'urd_Arab-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 307350, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'urd_Arab-npi_Deva': {'num_samples': 1024, 
'number_of_characters': 307635, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'urd_Arab-ory_Orya': {'num_samples': 1024, 'number_of_characters': 328442, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'urd_Arab-pan_Guru': {'num_samples': 1024, 'number_of_characters': 301079, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'urd_Arab-san_Deva': {'num_samples': 1024, 'number_of_characters': 312295, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'urd_Arab-sat_Olck': {'num_samples': 1024, 'number_of_characters': 320948, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'urd_Arab-snd_Deva': {'num_samples': 1024, 'number_of_characters': 314637, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'urd_Arab-tam_Taml': 
{'num_samples': 1024, 'number_of_characters': 342562, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'urd_Arab-tel_Telu': {'num_samples': 1024, 'number_of_characters': 313261, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}}}} | -| [IWSLT2017BitextMining](https://aclanthology.org/2017.iwslt-1.1/) | ['ara', 'cmn', 'deu', 'eng', 'fra', 'ita', 'jpn', 'kor', 'nld', 'ron'] | BitextMining | s2s | [Fiction, Non-fiction, Written] | {'validation': 21938} | {'validation': {'num_samples': 21938, 'number_of_characters': 4256244, 'unique_pairs': 21840, 'min_sentence1_length': 2, 'average_sentence1_length': 97.01, 'max_sentence1_length': 521, 'unique_sentence1': 11563, 'min_sentence2_length': 2, 'average_sentence2_length': 97.01, 'max_sentence2_length': 521, 'unique_sentence2': 11563, 'hf_subset_descriptive_stats': {'ar-en': {'num_samples': 888, 'number_of_characters': 172499, 'unique_pairs': 887, 'min_sentence1_length': 4, 'average_sentence1_length': 85.49, 'max_sentence1_length': 369, 'unique_sentence1': 887, 'min_sentence2_length': 10, 'average_sentence2_length': 108.77, 'max_sentence2_length': 462, 'unique_sentence2': 881}, 'de-en': {'num_samples': 888, 'number_of_characters': 202336, 'unique_pairs': 883, 'min_sentence1_length': 6, 'average_sentence1_length': 119.03, 'max_sentence1_length': 521, 'unique_sentence1': 881, 'min_sentence2_length': 10, 'average_sentence2_length': 108.83, 'max_sentence2_length': 462, 'unique_sentence2': 881}, 'en-ar': {'num_samples': 888, 'number_of_characters': 172499, 'unique_pairs': 887, 'min_sentence1_length': 10, 
'average_sentence1_length': 108.77, 'max_sentence1_length': 462, 'unique_sentence1': 881, 'min_sentence2_length': 4, 'average_sentence2_length': 85.49, 'max_sentence2_length': 369, 'unique_sentence2': 887}, 'en-de': {'num_samples': 888, 'number_of_characters': 202336, 'unique_pairs': 883, 'min_sentence1_length': 10, 'average_sentence1_length': 108.83, 'max_sentence1_length': 462, 'unique_sentence1': 881, 'min_sentence2_length': 6, 'average_sentence2_length': 119.03, 'max_sentence2_length': 521, 'unique_sentence2': 881}, 'en-fr': {'num_samples': 890, 'number_of_characters': 197619, 'unique_pairs': 883, 'min_sentence1_length': 10, 'average_sentence1_length': 108.41, 'max_sentence1_length': 462, 'unique_sentence1': 883, 'min_sentence2_length': 6, 'average_sentence2_length': 113.63, 'max_sentence2_length': 493, 'unique_sentence2': 881}, 'en-it': {'num_samples': 929, 'number_of_characters': 191803, 'unique_pairs': 924, 'min_sentence1_length': 10, 'average_sentence1_length': 103.0, 'max_sentence1_length': 433, 'unique_sentence1': 922, 'min_sentence2_length': 7, 'average_sentence2_length': 103.46, 'max_sentence2_length': 444, 'unique_sentence2': 918}, 'en-ja': {'num_samples': 871, 'number_of_characters': 132742, 'unique_pairs': 867, 'min_sentence1_length': 10, 'average_sentence1_length': 109.81, 'max_sentence1_length': 462, 'unique_sentence1': 864, 'min_sentence2_length': 5, 'average_sentence2_length': 42.59, 'max_sentence2_length': 225, 'unique_sentence2': 866}, 'en-ko': {'num_samples': 879, 'number_of_characters': 142659, 'unique_pairs': 874, 'min_sentence1_length': 10, 'average_sentence1_length': 107.74, 'max_sentence1_length': 462, 'unique_sentence1': 872, 'min_sentence2_length': 3, 'average_sentence2_length': 54.56, 'max_sentence2_length': 250, 'unique_sentence2': 872}, 'en-nl': {'num_samples': 1003, 'number_of_characters': 189637, 'unique_pairs': 1000, 'min_sentence1_length': 10, 'average_sentence1_length': 95.27, 'max_sentence1_length': 433, 'unique_sentence1': 
996, 'min_sentence2_length': 4, 'average_sentence2_length': 93.8, 'max_sentence2_length': 477, 'unique_sentence2': 1000}, 'en-ro': {'num_samples': 914, 'number_of_characters': 194128, 'unique_pairs': 910, 'min_sentence1_length': 10, 'average_sentence1_length': 104.72, 'max_sentence1_length': 433, 'unique_sentence1': 907, 'min_sentence2_length': 9, 'average_sentence2_length': 107.67, 'max_sentence2_length': 448, 'unique_sentence2': 910}, 'en-zh': {'num_samples': 879, 'number_of_characters': 131126, 'unique_pairs': 877, 'min_sentence1_length': 10, 'average_sentence1_length': 109.37, 'max_sentence1_length': 462, 'unique_sentence1': 872, 'min_sentence2_length': 2, 'average_sentence2_length': 39.81, 'max_sentence2_length': 230, 'unique_sentence2': 867}, 'fr-en': {'num_samples': 890, 'number_of_characters': 197619, 'unique_pairs': 883, 'min_sentence1_length': 6, 'average_sentence1_length': 113.63, 'max_sentence1_length': 493, 'unique_sentence1': 881, 'min_sentence2_length': 10, 'average_sentence2_length': 108.41, 'max_sentence2_length': 462, 'unique_sentence2': 883}, 'it-en': {'num_samples': 929, 'number_of_characters': 191803, 'unique_pairs': 924, 'min_sentence1_length': 7, 'average_sentence1_length': 103.46, 'max_sentence1_length': 444, 'unique_sentence1': 918, 'min_sentence2_length': 10, 'average_sentence2_length': 103.0, 'max_sentence2_length': 433, 'unique_sentence2': 922}, 'it-nl': {'num_samples': 1001, 'number_of_characters': 188858, 'unique_pairs': 998, 'min_sentence1_length': 7, 'average_sentence1_length': 94.64, 'max_sentence1_length': 459, 'unique_sentence1': 994, 'min_sentence2_length': 7, 'average_sentence2_length': 94.03, 'max_sentence2_length': 505, 'unique_sentence2': 998}, 'it-ro': {'num_samples': 914, 'number_of_characters': 193339, 'unique_pairs': 911, 'min_sentence1_length': 7, 'average_sentence1_length': 103.91, 'max_sentence1_length': 435, 'unique_sentence1': 907, 'min_sentence2_length': 9, 'average_sentence2_length': 107.62, 'max_sentence2_length': 
448, 'unique_sentence2': 910}, 'ja-en': {'num_samples': 871, 'number_of_characters': 132742, 'unique_pairs': 867, 'min_sentence1_length': 5, 'average_sentence1_length': 42.59, 'max_sentence1_length': 225, 'unique_sentence1': 866, 'min_sentence2_length': 10, 'average_sentence2_length': 109.81, 'max_sentence2_length': 462, 'unique_sentence2': 864}, 'ko-en': {'num_samples': 879, 'number_of_characters': 142659, 'unique_pairs': 874, 'min_sentence1_length': 3, 'average_sentence1_length': 54.56, 'max_sentence1_length': 250, 'unique_sentence1': 872, 'min_sentence2_length': 10, 'average_sentence2_length': 107.74, 'max_sentence2_length': 462, 'unique_sentence2': 872}, 'nl-en': {'num_samples': 1003, 'number_of_characters': 189637, 'unique_pairs': 1000, 'min_sentence1_length': 4, 'average_sentence1_length': 93.8, 'max_sentence1_length': 477, 'unique_sentence1': 1000, 'min_sentence2_length': 10, 'average_sentence2_length': 95.27, 'max_sentence2_length': 433, 'unique_sentence2': 996}, 'nl-it': {'num_samples': 1001, 'number_of_characters': 188858, 'unique_pairs': 998, 'min_sentence1_length': 7, 'average_sentence1_length': 94.03, 'max_sentence1_length': 505, 'unique_sentence1': 998, 'min_sentence2_length': 7, 'average_sentence2_length': 94.64, 'max_sentence2_length': 459, 'unique_sentence2': 994}, 'nl-ro': {'num_samples': 913, 'number_of_characters': 191376, 'unique_pairs': 911, 'min_sentence1_length': 7, 'average_sentence1_length': 102.02, 'max_sentence1_length': 478, 'unique_sentence1': 909, 'min_sentence2_length': 9, 'average_sentence2_length': 107.59, 'max_sentence2_length': 515, 'unique_sentence2': 909}, 'ro-en': {'num_samples': 914, 'number_of_characters': 194128, 'unique_pairs': 910, 'min_sentence1_length': 9, 'average_sentence1_length': 107.67, 'max_sentence1_length': 448, 'unique_sentence1': 910, 'min_sentence2_length': 10, 'average_sentence2_length': 104.72, 'max_sentence2_length': 433, 'unique_sentence2': 907}, 'ro-it': {'num_samples': 914, 'number_of_characters': 
193339, 'unique_pairs': 911, 'min_sentence1_length': 9, 'average_sentence1_length': 107.62, 'max_sentence1_length': 448, 'unique_sentence1': 910, 'min_sentence2_length': 7, 'average_sentence2_length': 103.91, 'max_sentence2_length': 435, 'unique_sentence2': 907}, 'ro-nl': {'num_samples': 913, 'number_of_characters': 191376, 'unique_pairs': 911, 'min_sentence1_length': 9, 'average_sentence1_length': 107.59, 'max_sentence1_length': 515, 'unique_sentence1': 909, 'min_sentence2_length': 7, 'average_sentence2_length': 102.02, 'max_sentence2_length': 478, 'unique_sentence2': 909}, 'zh-en': {'num_samples': 879, 'number_of_characters': 131126, 'unique_pairs': 877, 'min_sentence1_length': 2, 'average_sentence1_length': 39.81, 'max_sentence1_length': 230, 'unique_sentence1': 867, 'min_sentence2_length': 10, 'average_sentence2_length': 109.37, 'max_sentence2_length': 462, 'unique_sentence2': 872}}}} | +| [IWSLT2017BitextMining](https://aclanthology.org/2017.iwslt-1.1/) (Cettolo et al., 2017) | ['ara', 'cmn', 'deu', 'eng', 'fra', 'ita', 'jpn', 'kor', 'nld', 'ron'] | BitextMining | s2s | [Fiction, Non-fiction, Written] | {'validation': 21938} | {'validation': {'num_samples': 21938, 'number_of_characters': 4256244, 'unique_pairs': 21840, 'min_sentence1_length': 2, 'average_sentence1_length': 97.01, 'max_sentence1_length': 521, 'unique_sentence1': 11563, 'min_sentence2_length': 2, 'average_sentence2_length': 97.01, 'max_sentence2_length': 521, 'unique_sentence2': 11563, 'hf_subset_descriptive_stats': {'ar-en': {'num_samples': 888, 'number_of_characters': 172499, 'unique_pairs': 887, 'min_sentence1_length': 4, 'average_sentence1_length': 85.49, 'max_sentence1_length': 369, 'unique_sentence1': 887, 'min_sentence2_length': 10, 'average_sentence2_length': 108.77, 'max_sentence2_length': 462, 'unique_sentence2': 881}, 'de-en': {'num_samples': 888, 'number_of_characters': 202336, 'unique_pairs': 883, 'min_sentence1_length': 6, 'average_sentence1_length': 119.03, 'max_sentence1_length': 
521, 'unique_sentence1': 881, 'min_sentence2_length': 10, 'average_sentence2_length': 108.83, 'max_sentence2_length': 462, 'unique_sentence2': 881}, 'en-ar': {'num_samples': 888, 'number_of_characters': 172499, 'unique_pairs': 887, 'min_sentence1_length': 10, 'average_sentence1_length': 108.77, 'max_sentence1_length': 462, 'unique_sentence1': 881, 'min_sentence2_length': 4, 'average_sentence2_length': 85.49, 'max_sentence2_length': 369, 'unique_sentence2': 887}, 'en-de': {'num_samples': 888, 'number_of_characters': 202336, 'unique_pairs': 883, 'min_sentence1_length': 10, 'average_sentence1_length': 108.83, 'max_sentence1_length': 462, 'unique_sentence1': 881, 'min_sentence2_length': 6, 'average_sentence2_length': 119.03, 'max_sentence2_length': 521, 'unique_sentence2': 881}, 'en-fr': {'num_samples': 890, 'number_of_characters': 197619, 'unique_pairs': 883, 'min_sentence1_length': 10, 'average_sentence1_length': 108.41, 'max_sentence1_length': 462, 'unique_sentence1': 883, 'min_sentence2_length': 6, 'average_sentence2_length': 113.63, 'max_sentence2_length': 493, 'unique_sentence2': 881}, 'en-it': {'num_samples': 929, 'number_of_characters': 191803, 'unique_pairs': 924, 'min_sentence1_length': 10, 'average_sentence1_length': 103.0, 'max_sentence1_length': 433, 'unique_sentence1': 922, 'min_sentence2_length': 7, 'average_sentence2_length': 103.46, 'max_sentence2_length': 444, 'unique_sentence2': 918}, 'en-ja': {'num_samples': 871, 'number_of_characters': 132742, 'unique_pairs': 867, 'min_sentence1_length': 10, 'average_sentence1_length': 109.81, 'max_sentence1_length': 462, 'unique_sentence1': 864, 'min_sentence2_length': 5, 'average_sentence2_length': 42.59, 'max_sentence2_length': 225, 'unique_sentence2': 866}, 'en-ko': {'num_samples': 879, 'number_of_characters': 142659, 'unique_pairs': 874, 'min_sentence1_length': 10, 'average_sentence1_length': 107.74, 'max_sentence1_length': 462, 'unique_sentence1': 872, 'min_sentence2_length': 3, 'average_sentence2_length': 
54.56, 'max_sentence2_length': 250, 'unique_sentence2': 872}, 'en-nl': {'num_samples': 1003, 'number_of_characters': 189637, 'unique_pairs': 1000, 'min_sentence1_length': 10, 'average_sentence1_length': 95.27, 'max_sentence1_length': 433, 'unique_sentence1': 996, 'min_sentence2_length': 4, 'average_sentence2_length': 93.8, 'max_sentence2_length': 477, 'unique_sentence2': 1000}, 'en-ro': {'num_samples': 914, 'number_of_characters': 194128, 'unique_pairs': 910, 'min_sentence1_length': 10, 'average_sentence1_length': 104.72, 'max_sentence1_length': 433, 'unique_sentence1': 907, 'min_sentence2_length': 9, 'average_sentence2_length': 107.67, 'max_sentence2_length': 448, 'unique_sentence2': 910}, 'en-zh': {'num_samples': 879, 'number_of_characters': 131126, 'unique_pairs': 877, 'min_sentence1_length': 10, 'average_sentence1_length': 109.37, 'max_sentence1_length': 462, 'unique_sentence1': 872, 'min_sentence2_length': 2, 'average_sentence2_length': 39.81, 'max_sentence2_length': 230, 'unique_sentence2': 867}, 'fr-en': {'num_samples': 890, 'number_of_characters': 197619, 'unique_pairs': 883, 'min_sentence1_length': 6, 'average_sentence1_length': 113.63, 'max_sentence1_length': 493, 'unique_sentence1': 881, 'min_sentence2_length': 10, 'average_sentence2_length': 108.41, 'max_sentence2_length': 462, 'unique_sentence2': 883}, 'it-en': {'num_samples': 929, 'number_of_characters': 191803, 'unique_pairs': 924, 'min_sentence1_length': 7, 'average_sentence1_length': 103.46, 'max_sentence1_length': 444, 'unique_sentence1': 918, 'min_sentence2_length': 10, 'average_sentence2_length': 103.0, 'max_sentence2_length': 433, 'unique_sentence2': 922}, 'it-nl': {'num_samples': 1001, 'number_of_characters': 188858, 'unique_pairs': 998, 'min_sentence1_length': 7, 'average_sentence1_length': 94.64, 'max_sentence1_length': 459, 'unique_sentence1': 994, 'min_sentence2_length': 7, 'average_sentence2_length': 94.03, 'max_sentence2_length': 505, 'unique_sentence2': 998}, 'it-ro': {'num_samples': 
914, 'number_of_characters': 193339, 'unique_pairs': 911, 'min_sentence1_length': 7, 'average_sentence1_length': 103.91, 'max_sentence1_length': 435, 'unique_sentence1': 907, 'min_sentence2_length': 9, 'average_sentence2_length': 107.62, 'max_sentence2_length': 448, 'unique_sentence2': 910}, 'ja-en': {'num_samples': 871, 'number_of_characters': 132742, 'unique_pairs': 867, 'min_sentence1_length': 5, 'average_sentence1_length': 42.59, 'max_sentence1_length': 225, 'unique_sentence1': 866, 'min_sentence2_length': 10, 'average_sentence2_length': 109.81, 'max_sentence2_length': 462, 'unique_sentence2': 864}, 'ko-en': {'num_samples': 879, 'number_of_characters': 142659, 'unique_pairs': 874, 'min_sentence1_length': 3, 'average_sentence1_length': 54.56, 'max_sentence1_length': 250, 'unique_sentence1': 872, 'min_sentence2_length': 10, 'average_sentence2_length': 107.74, 'max_sentence2_length': 462, 'unique_sentence2': 872}, 'nl-en': {'num_samples': 1003, 'number_of_characters': 189637, 'unique_pairs': 1000, 'min_sentence1_length': 4, 'average_sentence1_length': 93.8, 'max_sentence1_length': 477, 'unique_sentence1': 1000, 'min_sentence2_length': 10, 'average_sentence2_length': 95.27, 'max_sentence2_length': 433, 'unique_sentence2': 996}, 'nl-it': {'num_samples': 1001, 'number_of_characters': 188858, 'unique_pairs': 998, 'min_sentence1_length': 7, 'average_sentence1_length': 94.03, 'max_sentence1_length': 505, 'unique_sentence1': 998, 'min_sentence2_length': 7, 'average_sentence2_length': 94.64, 'max_sentence2_length': 459, 'unique_sentence2': 994}, 'nl-ro': {'num_samples': 913, 'number_of_characters': 191376, 'unique_pairs': 911, 'min_sentence1_length': 7, 'average_sentence1_length': 102.02, 'max_sentence1_length': 478, 'unique_sentence1': 909, 'min_sentence2_length': 9, 'average_sentence2_length': 107.59, 'max_sentence2_length': 515, 'unique_sentence2': 909}, 'ro-en': {'num_samples': 914, 'number_of_characters': 194128, 'unique_pairs': 910, 'min_sentence1_length': 9, 
'average_sentence1_length': 107.67, 'max_sentence1_length': 448, 'unique_sentence1': 910, 'min_sentence2_length': 10, 'average_sentence2_length': 104.72, 'max_sentence2_length': 433, 'unique_sentence2': 907}, 'ro-it': {'num_samples': 914, 'number_of_characters': 193339, 'unique_pairs': 911, 'min_sentence1_length': 9, 'average_sentence1_length': 107.62, 'max_sentence1_length': 448, 'unique_sentence1': 910, 'min_sentence2_length': 7, 'average_sentence2_length': 103.91, 'max_sentence2_length': 435, 'unique_sentence2': 907}, 'ro-nl': {'num_samples': 913, 'number_of_characters': 191376, 'unique_pairs': 911, 'min_sentence1_length': 9, 'average_sentence1_length': 107.59, 'max_sentence1_length': 515, 'unique_sentence1': 909, 'min_sentence2_length': 7, 'average_sentence2_length': 102.02, 'max_sentence2_length': 478, 'unique_sentence2': 909}, 'zh-en': {'num_samples': 879, 'number_of_characters': 131126, 'unique_pairs': 877, 'min_sentence1_length': 2, 'average_sentence1_length': 39.81, 'max_sentence1_length': 230, 'unique_sentence1': 867, 'min_sentence2_length': 10, 'average_sentence2_length': 109.37, 'max_sentence2_length': 462, 'unique_sentence2': 872}}}} | | [ImageCoDe](https://aclanthology.org/2022.acl-long.241.pdf) (Krojer et al., 2022) | ['eng'] | Compositionality | it2i | [Web, Written] | {'test': 2302} | {'test': {'num_samples': 2302, 'num_images': 23020, 'num_texts': 2302, 'num_unique_texts': 2302, 'min_text_length': 1, 'average_text_length': 102.72, 'max_text_length': 350}} | | [ImageCoDeT2IRetrieval](https://aclanthology.org/2022.acl-long.241.pdf) (Krojer et al., 2022) | ['eng'] | Any2AnyRetrieval | t2i | [Web, Written] | {'test': 25322} | {'test': {'number_of_characters': 236457, 'num_samples': 25322, 'num_queries': 2302, 'num_documents': 23020, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 23020, 'min_query_length': 1, 'average_query_length': 102.72, 'max_query_length': 350, 
'unique_queries': 2302, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 2302}} | | [ImageNet10Clustering](https://www.kaggle.com/datasets/liusha249/imagenet10) (Deng et al., 2009) | ['eng'] | ImageClustering | i2t | [Web] | {'test': 13000} | {'test': {'num_samples': 13000, 'unique_num_labels': 10, 'min_image_width': 224, 'average_image_width': 224.0, 'max_image_width': 224, 'min_image_height': 224, 'average_image_height': 224.0, 'max_image_height': 224, 'labels': {'0': {'count': 1300}, '1': {'count': 1300}, '2': {'count': 1300}, '3': {'count': 1300}, '4': {'count': 1300}, '5': {'count': 1300}, '6': {'count': 1300}, '7': {'count': 1300}, '8': {'count': 1300}, '10': {'count': 1300}}}} | | [ImageNetDog15Clustering](http://vision.stanford.edu/aditya86/ImageNetDogs/main.html) (Deng et al., 2009) | ['eng'] | ImageClustering | i2i | [Web] | {'test': 1076} | {'test': {'num_samples': 1076, 'unique_num_labels': 15, 'min_image_width': 224, 'average_image_width': 224.0, 'max_image_width': 224, 'min_image_height': 224, 'average_image_height': 224.0, 'max_image_height': 224, 'labels': {'0': {'count': 152}, '1': {'count': 88}, '2': {'count': 75}, '3': {'count': 96}, '4': {'count': 57}, '5': {'count': 50}, '6': {'count': 52}, '7': {'count': 50}, '8': {'count': 50}, '9': {'count': 50}, '10': {'count': 53}, '11': {'count': 57}, '12': {'count': 50}, '13': {'count': 100}, '14': {'count': 96}}}} | | [Imagenet1k](https://ieeexplore.ieee.org/document/5206848) (Deng et al., 2009) | ['eng'] | ImageClassification | i2i | [Scene] | {'test': 50000} | {'test': {'num_samples': 50000, 'unique_num_labels': 1000, 'min_image_width': 54, 'average_image_width': 490.4, 'max_image_width': 4288, 'min_image_height': 56, 'average_image_height': 430.24, 'max_image_height': 5005, 'labels': {'0': {'count': 50}, '1': {'count': 50}, '2': {'count': 50}, '3': {'count': 50}, '4': {'count': 50}, '5': 
{'count': 50}, '6': {'count': 50}, '7': {'count': 50}, '8': {'count': 50}, '9': {'count': 50}, '10': {'count': 50}, '11': {'count': 50}, '12': {'count': 50}, '13': {'count': 50}, '14': {'count': 50}, '15': {'count': 50}, '16': {'count': 50}, '17': {'count': 50}, '18': {'count': 50}, '19': {'count': 50}, '20': {'count': 50}, '21': {'count': 50}, '22': {'count': 50}, '23': {'count': 50}, '24': {'count': 50}, '25': {'count': 50}, '26': {'count': 50}, '27': {'count': 50}, '28': {'count': 50}, '29': {'count': 50}, '30': {'count': 50}, '31': {'count': 50}, '32': {'count': 50}, '33': {'count': 50}, '34': {'count': 50}, '35': {'count': 50}, '36': {'count': 50}, '37': {'count': 50}, '38': {'count': 50}, '39': {'count': 50}, '40': {'count': 50}, '41': {'count': 50}, '42': {'count': 50}, '43': {'count': 50}, '44': {'count': 50}, '45': {'count': 50}, '46': {'count': 50}, '47': {'count': 50}, '48': {'count': 50}, '49': {'count': 50}, '50': {'count': 50}, '51': {'count': 50}, '52': {'count': 50}, '53': {'count': 50}, '54': {'count': 50}, '55': {'count': 50}, '56': {'count': 50}, '57': {'count': 50}, '58': {'count': 50}, '59': {'count': 50}, '60': {'count': 50}, '61': {'count': 50}, '62': {'count': 50}, '63': {'count': 50}, '64': {'count': 50}, '65': {'count': 50}, '66': {'count': 50}, '67': {'count': 50}, '68': {'count': 50}, '69': {'count': 50}, '70': {'count': 50}, '71': {'count': 50}, '72': {'count': 50}, '73': {'count': 50}, '74': {'count': 50}, '75': {'count': 50}, '76': {'count': 50}, '77': {'count': 50}, '78': {'count': 50}, '79': {'count': 50}, '80': {'count': 50}, '81': {'count': 50}, '82': {'count': 50}, '83': {'count': 50}, '84': {'count': 50}, '85': {'count': 50}, '86': {'count': 50}, '87': {'count': 50}, '88': {'count': 50}, '89': {'count': 50}, '90': {'count': 50}, '91': {'count': 50}, '92': {'count': 50}, '93': {'count': 50}, '94': {'count': 50}, '95': {'count': 50}, '96': {'count': 50}, '97': {'count': 50}, '98': {'count': 50}, '99': {'count': 50}, '100': 
{'count': 50}, '101': {'count': 50}, '102': {'count': 50}, '103': {'count': 50}, '104': {'count': 50}, '105': {'count': 50}, '106': {'count': 50}, '107': {'count': 50}, '108': {'count': 50}, '109': {'count': 50}, '110': {'count': 50}, '111': {'count': 50}, '112': {'count': 50}, '113': {'count': 50}, '114': {'count': 50}, '115': {'count': 50}, '116': {'count': 50}, '117': {'count': 50}, '118': {'count': 50}, '119': {'count': 50}, '120': {'count': 50}, '121': {'count': 50}, '122': {'count': 50}, '123': {'count': 50}, '124': {'count': 50}, '125': {'count': 50}, '126': {'count': 50}, '127': {'count': 50}, '128': {'count': 50}, '129': {'count': 50}, '130': {'count': 50}, '131': {'count': 50}, '132': {'count': 50}, '133': {'count': 50}, '134': {'count': 50}, '135': {'count': 50}, '136': {'count': 50}, '137': {'count': 50}, '138': {'count': 50}, '139': {'count': 50}, '140': {'count': 50}, '141': {'count': 50}, '142': {'count': 50}, '143': {'count': 50}, '144': {'count': 50}, '145': {'count': 50}, '146': {'count': 50}, '147': {'count': 50}, '148': {'count': 50}, '149': {'count': 50}, '150': {'count': 50}, '151': {'count': 50}, '152': {'count': 50}, '153': {'count': 50}, '154': {'count': 50}, '155': {'count': 50}, '156': {'count': 50}, '157': {'count': 50}, '158': {'count': 50}, '159': {'count': 50}, '160': {'count': 50}, '161': {'count': 50}, '162': {'count': 50}, '163': {'count': 50}, '164': {'count': 50}, '165': {'count': 50}, '166': {'count': 50}, '167': {'count': 50}, '168': {'count': 50}, '169': {'count': 50}, '170': {'count': 50}, '171': {'count': 50}, '172': {'count': 50}, '173': {'count': 50}, '174': {'count': 50}, '175': {'count': 50}, '176': {'count': 50}, '177': {'count': 50}, '178': {'count': 50}, '179': {'count': 50}, '180': {'count': 50}, '181': {'count': 50}, '182': {'count': 50}, '183': {'count': 50}, '184': {'count': 50}, '185': {'count': 50}, '186': {'count': 50}, '187': {'count': 50}, '188': {'count': 50}, '189': {'count': 50}, '190': {'count': 50}, 
'191': {'count': 50}, '192': {'count': 50}, '193': {'count': 50}, '194': {'count': 50}, '195': {'count': 50}, '196': {'count': 50}, '197': {'count': 50}, '198': {'count': 50}, '199': {'count': 50}, '200': {'count': 50}, '201': {'count': 50}, '202': {'count': 50}, '203': {'count': 50}, '204': {'count': 50}, '205': {'count': 50}, '206': {'count': 50}, '207': {'count': 50}, '208': {'count': 50}, '209': {'count': 50}, '210': {'count': 50}, '211': {'count': 50}, '212': {'count': 50}, '213': {'count': 50}, '214': {'count': 50}, '215': {'count': 50}, '216': {'count': 50}, '217': {'count': 50}, '218': {'count': 50}, '219': {'count': 50}, '220': {'count': 50}, '221': {'count': 50}, '222': {'count': 50}, '223': {'count': 50}, '224': {'count': 50}, '225': {'count': 50}, '226': {'count': 50}, '227': {'count': 50}, '228': {'count': 50}, '229': {'count': 50}, '230': {'count': 50}, '231': {'count': 50}, '232': {'count': 50}, '233': {'count': 50}, '234': {'count': 50}, '235': {'count': 50}, '236': {'count': 50}, '237': {'count': 50}, '238': {'count': 50}, '239': {'count': 50}, '240': {'count': 50}, '241': {'count': 50}, '242': {'count': 50}, '243': {'count': 50}, '244': {'count': 50}, '245': {'count': 50}, '246': {'count': 50}, '247': {'count': 50}, '248': {'count': 50}, '249': {'count': 50}, '250': {'count': 50}, '251': {'count': 50}, '252': {'count': 50}, '253': {'count': 50}, '254': {'count': 50}, '255': {'count': 50}, '256': {'count': 50}, '257': {'count': 50}, '258': {'count': 50}, '259': {'count': 50}, '260': {'count': 50}, '261': {'count': 50}, '262': {'count': 50}, '263': {'count': 50}, '264': {'count': 50}, '265': {'count': 50}, '266': {'count': 50}, '267': {'count': 50}, '268': {'count': 50}, '269': {'count': 50}, '270': {'count': 50}, '271': {'count': 50}, '272': {'count': 50}, '273': {'count': 50}, '274': {'count': 50}, '275': {'count': 50}, '276': {'count': 50}, '277': {'count': 50}, '278': {'count': 50}, '279': {'count': 50}, '280': {'count': 50}, '281': {'count': 
50}, '282': {'count': 50}, '283': {'count': 50}, '284': {'count': 50}, '285': {'count': 50}, '286': {'count': 50}, '287': {'count': 50}, '288': {'count': 50}, '289': {'count': 50}, '290': {'count': 50}, '291': {'count': 50}, '292': {'count': 50}, '293': {'count': 50}, '294': {'count': 50}, '295': {'count': 50}, '296': {'count': 50}, '297': {'count': 50}, '298': {'count': 50}, '299': {'count': 50}, '300': {'count': 50}, '301': {'count': 50}, '302': {'count': 50}, '303': {'count': 50}, '304': {'count': 50}, '305': {'count': 50}, '306': {'count': 50}, '307': {'count': 50}, '308': {'count': 50}, '309': {'count': 50}, '310': {'count': 50}, '311': {'count': 50}, '312': {'count': 50}, '313': {'count': 50}, '314': {'count': 50}, '315': {'count': 50}, '316': {'count': 50}, '317': {'count': 50}, '318': {'count': 50}, '319': {'count': 50}, '320': {'count': 50}, '321': {'count': 50}, '322': {'count': 50}, '323': {'count': 50}, '324': {'count': 50}, '325': {'count': 50}, '326': {'count': 50}, '327': {'count': 50}, '328': {'count': 50}, '329': {'count': 50}, '330': {'count': 50}, '331': {'count': 50}, '332': {'count': 50}, '333': {'count': 50}, '334': {'count': 50}, '335': {'count': 50}, '336': {'count': 50}, '337': {'count': 50}, '338': {'count': 50}, '339': {'count': 50}, '340': {'count': 50}, '341': {'count': 50}, '342': {'count': 50}, '343': {'count': 50}, '344': {'count': 50}, '345': {'count': 50}, '346': {'count': 50}, '347': {'count': 50}, '348': {'count': 50}, '349': {'count': 50}, '350': {'count': 50}, '351': {'count': 50}, '352': {'count': 50}, '353': {'count': 50}, '354': {'count': 50}, '355': {'count': 50}, '356': {'count': 50}, '357': {'count': 50}, '358': {'count': 50}, '359': {'count': 50}, '360': {'count': 50}, '361': {'count': 50}, '362': {'count': 50}, '363': {'count': 50}, '364': {'count': 50}, '365': {'count': 50}, '366': {'count': 50}, '367': {'count': 50}, '368': {'count': 50}, '369': {'count': 50}, '370': {'count': 50}, '371': {'count': 50}, '372': 
{'count': 50}, '373': {'count': 50}, '374': {'count': 50}, '375': {'count': 50}, '376': {'count': 50}, '377': {'count': 50}, '378': {'count': 50}, '379': {'count': 50}, '380': {'count': 50}, '381': {'count': 50}, '382': {'count': 50}, '383': {'count': 50}, '384': {'count': 50}, '385': {'count': 50}, '386': {'count': 50}, '387': {'count': 50}, '388': {'count': 50}, '389': {'count': 50}, '390': {'count': 50}, '391': {'count': 50}, '392': {'count': 50}, '393': {'count': 50}, '394': {'count': 50}, '395': {'count': 50}, '396': {'count': 50}, '397': {'count': 50}, '398': {'count': 50}, '399': {'count': 50}, '400': {'count': 50}, '401': {'count': 50}, '402': {'count': 50}, '403': {'count': 50}, '404': {'count': 50}, '405': {'count': 50}, '406': {'count': 50}, '407': {'count': 50}, '408': {'count': 50}, '409': {'count': 50}, '410': {'count': 50}, '411': {'count': 50}, '412': {'count': 50}, '413': {'count': 50}, '414': {'count': 50}, '415': {'count': 50}, '416': {'count': 50}, '417': {'count': 50}, '418': {'count': 50}, '419': {'count': 50}, '420': {'count': 50}, '421': {'count': 50}, '422': {'count': 50}, '423': {'count': 50}, '424': {'count': 50}, '425': {'count': 50}, '426': {'count': 50}, '427': {'count': 50}, '428': {'count': 50}, '429': {'count': 50}, '430': {'count': 50}, '431': {'count': 50}, '432': {'count': 50}, '433': {'count': 50}, '434': {'count': 50}, '435': {'count': 50}, '436': {'count': 50}, '437': {'count': 50}, '438': {'count': 50}, '439': {'count': 50}, '440': {'count': 50}, '441': {'count': 50}, '442': {'count': 50}, '443': {'count': 50}, '444': {'count': 50}, '445': {'count': 50}, '446': {'count': 50}, '447': {'count': 50}, '448': {'count': 50}, '449': {'count': 50}, '450': {'count': 50}, '451': {'count': 50}, '452': {'count': 50}, '453': {'count': 50}, '454': {'count': 50}, '455': {'count': 50}, '456': {'count': 50}, '457': {'count': 50}, '458': {'count': 50}, '459': {'count': 50}, '460': {'count': 50}, '461': {'count': 50}, '462': {'count': 50}, 
'463': {'count': 50}, '464': {'count': 50}, '465': {'count': 50}, '466': {'count': 50}, '467': {'count': 50}, '468': {'count': 50}, '469': {'count': 50}, '470': {'count': 50}, '471': {'count': 50}, '472': {'count': 50}, '473': {'count': 50}, '474': {'count': 50}, '475': {'count': 50}, '476': {'count': 50}, '477': {'count': 50}, '478': {'count': 50}, '479': {'count': 50}, '480': {'count': 50}, '481': {'count': 50}, '482': {'count': 50}, '483': {'count': 50}, '484': {'count': 50}, '485': {'count': 50}, '486': {'count': 50}, '487': {'count': 50}, '488': {'count': 50}, '489': {'count': 50}, '490': {'count': 50}, '491': {'count': 50}, '492': {'count': 50}, '493': {'count': 50}, '494': {'count': 50}, '495': {'count': 50}, '496': {'count': 50}, '497': {'count': 50}, '498': {'count': 50}, '499': {'count': 50}, '500': {'count': 50}, '501': {'count': 50}, '502': {'count': 50}, '503': {'count': 50}, '504': {'count': 50}, '505': {'count': 50}, '506': {'count': 50}, '507': {'count': 50}, '508': {'count': 50}, '509': {'count': 50}, '510': {'count': 50}, '511': {'count': 50}, '512': {'count': 50}, '513': {'count': 50}, '514': {'count': 50}, '515': {'count': 50}, '516': {'count': 50}, '517': {'count': 50}, '518': {'count': 50}, '519': {'count': 50}, '520': {'count': 50}, '521': {'count': 50}, '522': {'count': 50}, '523': {'count': 50}, '524': {'count': 50}, '525': {'count': 50}, '526': {'count': 50}, '527': {'count': 50}, '528': {'count': 50}, '529': {'count': 50}, '530': {'count': 50}, '531': {'count': 50}, '532': {'count': 50}, '533': {'count': 50}, '534': {'count': 50}, '535': {'count': 50}, '536': {'count': 50}, '537': {'count': 50}, '538': {'count': 50}, '539': {'count': 50}, '540': {'count': 50}, '541': {'count': 50}, '542': {'count': 50}, '543': {'count': 50}, '544': {'count': 50}, '545': {'count': 50}, '546': {'count': 50}, '547': {'count': 50}, '548': {'count': 50}, '549': {'count': 50}, '550': {'count': 50}, '551': {'count': 50}, '552': {'count': 50}, '553': {'count': 
50}, '554': {'count': 50}, '555': {'count': 50}, '556': {'count': 50}, '557': {'count': 50}, '558': {'count': 50}, '559': {'count': 50}, '560': {'count': 50}, '561': {'count': 50}, '562': {'count': 50}, '563': {'count': 50}, '564': {'count': 50}, '565': {'count': 50}, '566': {'count': 50}, '567': {'count': 50}, '568': {'count': 50}, '569': {'count': 50}, '570': {'count': 50}, '571': {'count': 50}, '572': {'count': 50}, '573': {'count': 50}, '574': {'count': 50}, '575': {'count': 50}, '576': {'count': 50}, '577': {'count': 50}, '578': {'count': 50}, '579': {'count': 50}, '580': {'count': 50}, '581': {'count': 50}, '582': {'count': 50}, '583': {'count': 50}, '584': {'count': 50}, '585': {'count': 50}, '586': {'count': 50}, '587': {'count': 50}, '588': {'count': 50}, '589': {'count': 50}, '590': {'count': 50}, '591': {'count': 50}, '592': {'count': 50}, '593': {'count': 50}, '594': {'count': 50}, '595': {'count': 50}, '596': {'count': 50}, '597': {'count': 50}, '598': {'count': 50}, '599': {'count': 50}, '600': {'count': 50}, '601': {'count': 50}, '602': {'count': 50}, '603': {'count': 50}, '604': {'count': 50}, '605': {'count': 50}, '606': {'count': 50}, '607': {'count': 50}, '608': {'count': 50}, '609': {'count': 50}, '610': {'count': 50}, '611': {'count': 50}, '612': {'count': 50}, '613': {'count': 50}, '614': {'count': 50}, '615': {'count': 50}, '616': {'count': 50}, '617': {'count': 50}, '618': {'count': 50}, '619': {'count': 50}, '620': {'count': 50}, '621': {'count': 50}, '622': {'count': 50}, '623': {'count': 50}, '624': {'count': 50}, '625': {'count': 50}, '626': {'count': 50}, '627': {'count': 50}, '628': {'count': 50}, '629': {'count': 50}, '630': {'count': 50}, '631': {'count': 50}, '632': {'count': 50}, '633': {'count': 50}, '634': {'count': 50}, '635': {'count': 50}, '636': {'count': 50}, '637': {'count': 50}, '638': {'count': 50}, '639': {'count': 50}, '640': {'count': 50}, '641': {'count': 50}, '642': {'count': 50}, '643': {'count': 50}, '644': 
{'count': 50}, '645': {'count': 50}, '646': {'count': 50}, '647': {'count': 50}, '648': {'count': 50}, '649': {'count': 50}, '650': {'count': 50}, '651': {'count': 50}, '652': {'count': 50}, '653': {'count': 50}, '654': {'count': 50}, '655': {'count': 50}, '656': {'count': 50}, '657': {'count': 50}, '658': {'count': 50}, '659': {'count': 50}, '660': {'count': 50}, '661': {'count': 50}, '662': {'count': 50}, '663': {'count': 50}, '664': {'count': 50}, '665': {'count': 50}, '666': {'count': 50}, '667': {'count': 50}, '668': {'count': 50}, '669': {'count': 50}, '670': {'count': 50}, '671': {'count': 50}, '672': {'count': 50}, '673': {'count': 50}, '674': {'count': 50}, '675': {'count': 50}, '676': {'count': 50}, '677': {'count': 50}, '678': {'count': 50}, '679': {'count': 50}, '680': {'count': 50}, '681': {'count': 50}, '682': {'count': 50}, '683': {'count': 50}, '684': {'count': 50}, '685': {'count': 50}, '686': {'count': 50}, '687': {'count': 50}, '688': {'count': 50}, '689': {'count': 50}, '690': {'count': 50}, '691': {'count': 50}, '692': {'count': 50}, '693': {'count': 50}, '694': {'count': 50}, '695': {'count': 50}, '696': {'count': 50}, '697': {'count': 50}, '698': {'count': 50}, '699': {'count': 50}, '700': {'count': 50}, '701': {'count': 50}, '702': {'count': 50}, '703': {'count': 50}, '704': {'count': 50}, '705': {'count': 50}, '706': {'count': 50}, '707': {'count': 50}, '708': {'count': 50}, '709': {'count': 50}, '710': {'count': 50}, '711': {'count': 50}, '712': {'count': 50}, '713': {'count': 50}, '714': {'count': 50}, '715': {'count': 50}, '716': {'count': 50}, '717': {'count': 50}, '718': {'count': 50}, '719': {'count': 50}, '720': {'count': 50}, '721': {'count': 50}, '722': {'count': 50}, '723': {'count': 50}, '724': {'count': 50}, '725': {'count': 50}, '726': {'count': 50}, '727': {'count': 50}, '728': {'count': 50}, '729': {'count': 50}, '730': {'count': 50}, '731': {'count': 50}, '732': {'count': 50}, '733': {'count': 50}, '734': {'count': 50}, 
'735': {'count': 50}, '736': {'count': 50}, '737': {'count': 50}, '738': {'count': 50}, '739': {'count': 50}, '740': {'count': 50}, '741': {'count': 50}, '742': {'count': 50}, '743': {'count': 50}, '744': {'count': 50}, '745': {'count': 50}, '746': {'count': 50}, '747': {'count': 50}, '748': {'count': 50}, '749': {'count': 50}, '750': {'count': 50}, '751': {'count': 50}, '752': {'count': 50}, '753': {'count': 50}, '754': {'count': 50}, '755': {'count': 50}, '756': {'count': 50}, '757': {'count': 50}, '758': {'count': 50}, '759': {'count': 50}, '760': {'count': 50}, '761': {'count': 50}, '762': {'count': 50}, '763': {'count': 50}, '764': {'count': 50}, '765': {'count': 50}, '766': {'count': 50}, '767': {'count': 50}, '768': {'count': 50}, '769': {'count': 50}, '770': {'count': 50}, '771': {'count': 50}, '772': {'count': 50}, '773': {'count': 50}, '774': {'count': 50}, '775': {'count': 50}, '776': {'count': 50}, '777': {'count': 50}, '778': {'count': 50}, '779': {'count': 50}, '780': {'count': 50}, '781': {'count': 50}, '782': {'count': 50}, '783': {'count': 50}, '784': {'count': 50}, '785': {'count': 50}, '786': {'count': 50}, '787': {'count': 50}, '788': {'count': 50}, '789': {'count': 50}, '790': {'count': 50}, '791': {'count': 50}, '792': {'count': 50}, '793': {'count': 50}, '794': {'count': 50}, '795': {'count': 50}, '796': {'count': 50}, '797': {'count': 50}, '798': {'count': 50}, '799': {'count': 50}, '800': {'count': 50}, '801': {'count': 50}, '802': {'count': 50}, '803': {'count': 50}, '804': {'count': 50}, '805': {'count': 50}, '806': {'count': 50}, '807': {'count': 50}, '808': {'count': 50}, '809': {'count': 50}, '810': {'count': 50}, '811': {'count': 50}, '812': {'count': 50}, '813': {'count': 50}, '814': {'count': 50}, '815': {'count': 50}, '816': {'count': 50}, '817': {'count': 50}, '818': {'count': 50}, '819': {'count': 50}, '820': {'count': 50}, '821': {'count': 50}, '822': {'count': 50}, '823': {'count': 50}, '824': {'count': 50}, '825': {'count': 
50}, '826': {'count': 50}, '827': {'count': 50}, '828': {'count': 50}, '829': {'count': 50}, '830': {'count': 50}, '831': {'count': 50}, '832': {'count': 50}, '833': {'count': 50}, '834': {'count': 50}, '835': {'count': 50}, '836': {'count': 50}, '837': {'count': 50}, '838': {'count': 50}, '839': {'count': 50}, '840': {'count': 50}, '841': {'count': 50}, '842': {'count': 50}, '843': {'count': 50}, '844': {'count': 50}, '845': {'count': 50}, '846': {'count': 50}, '847': {'count': 50}, '848': {'count': 50}, '849': {'count': 50}, '850': {'count': 50}, '851': {'count': 50}, '852': {'count': 50}, '853': {'count': 50}, '854': {'count': 50}, '855': {'count': 50}, '856': {'count': 50}, '857': {'count': 50}, '858': {'count': 50}, '859': {'count': 50}, '860': {'count': 50}, '861': {'count': 50}, '862': {'count': 50}, '863': {'count': 50}, '864': {'count': 50}, '865': {'count': 50}, '866': {'count': 50}, '867': {'count': 50}, '868': {'count': 50}, '869': {'count': 50}, '870': {'count': 50}, '871': {'count': 50}, '872': {'count': 50}, '873': {'count': 50}, '874': {'count': 50}, '875': {'count': 50}, '876': {'count': 50}, '877': {'count': 50}, '878': {'count': 50}, '879': {'count': 50}, '880': {'count': 50}, '881': {'count': 50}, '882': {'count': 50}, '883': {'count': 50}, '884': {'count': 50}, '885': {'count': 50}, '886': {'count': 50}, '887': {'count': 50}, '888': {'count': 50}, '889': {'count': 50}, '890': {'count': 50}, '891': {'count': 50}, '892': {'count': 50}, '893': {'count': 50}, '894': {'count': 50}, '895': {'count': 50}, '896': {'count': 50}, '897': {'count': 50}, '898': {'count': 50}, '899': {'count': 50}, '900': {'count': 50}, '901': {'count': 50}, '902': {'count': 50}, '903': {'count': 50}, '904': {'count': 50}, '905': {'count': 50}, '906': {'count': 50}, '907': {'count': 50}, '908': {'count': 50}, '909': {'count': 50}, '910': {'count': 50}, '911': {'count': 50}, '912': {'count': 50}, '913': {'count': 50}, '914': {'count': 50}, '915': {'count': 50}, '916': 
{'count': 50}, '917': {'count': 50}, '918': {'count': 50}, '919': {'count': 50}, '920': {'count': 50}, '921': {'count': 50}, '922': {'count': 50}, '923': {'count': 50}, '924': {'count': 50}, '925': {'count': 50}, '926': {'count': 50}, '927': {'count': 50}, '928': {'count': 50}, '929': {'count': 50}, '930': {'count': 50}, '931': {'count': 50}, '932': {'count': 50}, '933': {'count': 50}, '934': {'count': 50}, '935': {'count': 50}, '936': {'count': 50}, '937': {'count': 50}, '938': {'count': 50}, '939': {'count': 50}, '940': {'count': 50}, '941': {'count': 50}, '942': {'count': 50}, '943': {'count': 50}, '944': {'count': 50}, '945': {'count': 50}, '946': {'count': 50}, '947': {'count': 50}, '948': {'count': 50}, '949': {'count': 50}, '950': {'count': 50}, '951': {'count': 50}, '952': {'count': 50}, '953': {'count': 50}, '954': {'count': 50}, '955': {'count': 50}, '956': {'count': 50}, '957': {'count': 50}, '958': {'count': 50}, '959': {'count': 50}, '960': {'count': 50}, '961': {'count': 50}, '962': {'count': 50}, '963': {'count': 50}, '964': {'count': 50}, '965': {'count': 50}, '966': {'count': 50}, '967': {'count': 50}, '968': {'count': 50}, '969': {'count': 50}, '970': {'count': 50}, '971': {'count': 50}, '972': {'count': 50}, '973': {'count': 50}, '974': {'count': 50}, '975': {'count': 50}, '976': {'count': 50}, '977': {'count': 50}, '978': {'count': 50}, '979': {'count': 50}, '980': {'count': 50}, '981': {'count': 50}, '982': {'count': 50}, '983': {'count': 50}, '984': {'count': 50}, '985': {'count': 50}, '986': {'count': 50}, '987': {'count': 50}, '988': {'count': 50}, '989': {'count': 50}, '990': {'count': 50}, '991': {'count': 50}, '992': {'count': 50}, '993': {'count': 50}, '994': {'count': 50}, '995': {'count': 50}, '996': {'count': 50}, '997': {'count': 50}, '998': {'count': 50}, '999': {'count': 50}}}} | | [Imagenet1kZeroShot](https://ieeexplore.ieee.org/document/5206848) (Deng et al., 2009) | ['eng'] | ZeroShotClassification | i2t | [Scene] | {'test': 
50000} | {'test': {'num_samples': 50000, 'unique_num_labels': 1000, 'min_image_width': 54, 'average_image_width': 490.4, 'max_image_width': 4288, 'min_image_height': 56, 'average_image_height': 430.24, 'max_image_height': 5005, 'min_label_text_length': 15, 'average_label_text_length': 23.81, 'max_label_text_length': 50, 'labels': {'0': {'count': 50}, '1': {'count': 50}, '2': {'count': 50}, '3': {'count': 50}, '4': {'count': 50}, '5': {'count': 50}, '6': {'count': 50}, '7': {'count': 50}, '8': {'count': 50}, '9': {'count': 50}, '10': {'count': 50}, '11': {'count': 50}, '12': {'count': 50}, '13': {'count': 50}, '14': {'count': 50}, '15': {'count': 50}, '16': {'count': 50}, '17': {'count': 50}, '18': {'count': 50}, '19': {'count': 50}, '20': {'count': 50}, '21': {'count': 50}, '22': {'count': 50}, '23': {'count': 50}, '24': {'count': 50}, '25': {'count': 50}, '26': {'count': 50}, '27': {'count': 50}, '28': {'count': 50}, '29': {'count': 50}, '30': {'count': 50}, '31': {'count': 50}, '32': {'count': 50}, '33': {'count': 50}, '34': {'count': 50}, '35': {'count': 50}, '36': {'count': 50}, '37': {'count': 50}, '38': {'count': 50}, '39': {'count': 50}, '40': {'count': 50}, '41': {'count': 50}, '42': {'count': 50}, '43': {'count': 50}, '44': {'count': 50}, '45': {'count': 50}, '46': {'count': 50}, '47': {'count': 50}, '48': {'count': 50}, '49': {'count': 50}, '50': {'count': 50}, '51': {'count': 50}, '52': {'count': 50}, '53': {'count': 50}, '54': {'count': 50}, '55': {'count': 50}, '56': {'count': 50}, '57': {'count': 50}, '58': {'count': 50}, '59': {'count': 50}, '60': {'count': 50}, '61': {'count': 50}, '62': {'count': 50}, '63': {'count': 50}, '64': {'count': 50}, '65': {'count': 50}, '66': {'count': 50}, '67': {'count': 50}, '68': {'count': 50}, '69': {'count': 50}, '70': {'count': 50}, '71': {'count': 50}, '72': {'count': 50}, '73': {'count': 50}, '74': {'count': 50}, '75': {'count': 50}, '76': {'count': 50}, '77': {'count': 50}, '78': {'count': 50}, '79': {'count': 
50}, '80': {'count': 50}, '81': {'count': 50}, '82': {'count': 50}, '83': {'count': 50}, '84': {'count': 50}, '85': {'count': 50}, '86': {'count': 50}, '87': {'count': 50}, '88': {'count': 50}, '89': {'count': 50}, '90': {'count': 50}, '91': {'count': 50}, '92': {'count': 50}, '93': {'count': 50}, '94': {'count': 50}, '95': {'count': 50}, '96': {'count': 50}, '97': {'count': 50}, '98': {'count': 50}, '99': {'count': 50}, '100': {'count': 50}, '101': {'count': 50}, '102': {'count': 50}, '103': {'count': 50}, '104': {'count': 50}, '105': {'count': 50}, '106': {'count': 50}, '107': {'count': 50}, '108': {'count': 50}, '109': {'count': 50}, '110': {'count': 50}, '111': {'count': 50}, '112': {'count': 50}, '113': {'count': 50}, '114': {'count': 50}, '115': {'count': 50}, '116': {'count': 50}, '117': {'count': 50}, '118': {'count': 50}, '119': {'count': 50}, '120': {'count': 50}, '121': {'count': 50}, '122': {'count': 50}, '123': {'count': 50}, '124': {'count': 50}, '125': {'count': 50}, '126': {'count': 50}, '127': {'count': 50}, '128': {'count': 50}, '129': {'count': 50}, '130': {'count': 50}, '131': {'count': 50}, '132': {'count': 50}, '133': {'count': 50}, '134': {'count': 50}, '135': {'count': 50}, '136': {'count': 50}, '137': {'count': 50}, '138': {'count': 50}, '139': {'count': 50}, '140': {'count': 50}, '141': {'count': 50}, '142': {'count': 50}, '143': {'count': 50}, '144': {'count': 50}, '145': {'count': 50}, '146': {'count': 50}, '147': {'count': 50}, '148': {'count': 50}, '149': {'count': 50}, '150': {'count': 50}, '151': {'count': 50}, '152': {'count': 50}, '153': {'count': 50}, '154': {'count': 50}, '155': {'count': 50}, '156': {'count': 50}, '157': {'count': 50}, '158': {'count': 50}, '159': {'count': 50}, '160': {'count': 50}, '161': {'count': 50}, '162': {'count': 50}, '163': {'count': 50}, '164': {'count': 50}, '165': {'count': 50}, '166': {'count': 50}, '167': {'count': 50}, '168': {'count': 50}, '169': {'count': 50}, '170': {'count': 50}, '171': 
{'count': 50}, '172': {'count': 50}, '173': {'count': 50}, '174': {'count': 50}, '175': {'count': 50}, '176': {'count': 50}, '177': {'count': 50}, '178': {'count': 50}, '179': {'count': 50}, '180': {'count': 50}, '181': {'count': 50}, '182': {'count': 50}, '183': {'count': 50}, '184': {'count': 50}, '185': {'count': 50}, '186': {'count': 50}, '187': {'count': 50}, '188': {'count': 50}, '189': {'count': 50}, '190': {'count': 50}, '191': {'count': 50}, '192': {'count': 50}, '193': {'count': 50}, '194': {'count': 50}, '195': {'count': 50}, '196': {'count': 50}, '197': {'count': 50}, '198': {'count': 50}, '199': {'count': 50}, '200': {'count': 50}, '201': {'count': 50}, '202': {'count': 50}, '203': {'count': 50}, '204': {'count': 50}, '205': {'count': 50}, '206': {'count': 50}, '207': {'count': 50}, '208': {'count': 50}, '209': {'count': 50}, '210': {'count': 50}, '211': {'count': 50}, '212': {'count': 50}, '213': {'count': 50}, '214': {'count': 50}, '215': {'count': 50}, '216': {'count': 50}, '217': {'count': 50}, '218': {'count': 50}, '219': {'count': 50}, '220': {'count': 50}, '221': {'count': 50}, '222': {'count': 50}, '223': {'count': 50}, '224': {'count': 50}, '225': {'count': 50}, '226': {'count': 50}, '227': {'count': 50}, '228': {'count': 50}, '229': {'count': 50}, '230': {'count': 50}, '231': {'count': 50}, '232': {'count': 50}, '233': {'count': 50}, '234': {'count': 50}, '235': {'count': 50}, '236': {'count': 50}, '237': {'count': 50}, '238': {'count': 50}, '239': {'count': 50}, '240': {'count': 50}, '241': {'count': 50}, '242': {'count': 50}, '243': {'count': 50}, '244': {'count': 50}, '245': {'count': 50}, '246': {'count': 50}, '247': {'count': 50}, '248': {'count': 50}, '249': {'count': 50}, '250': {'count': 50}, '251': {'count': 50}, '252': {'count': 50}, '253': {'count': 50}, '254': {'count': 50}, '255': {'count': 50}, '256': {'count': 50}, '257': {'count': 50}, '258': {'count': 50}, '259': {'count': 50}, '260': {'count': 50}, '261': {'count': 50}, 
'262': {'count': 50}, '263': {'count': 50}, '264': {'count': 50}, '265': {'count': 50}, '266': {'count': 50}, '267': {'count': 50}, '268': {'count': 50}, '269': {'count': 50}, '270': {'count': 50}, '271': {'count': 50}, '272': {'count': 50}, '273': {'count': 50}, '274': {'count': 50}, '275': {'count': 50}, '276': {'count': 50}, '277': {'count': 50}, '278': {'count': 50}, '279': {'count': 50}, '280': {'count': 50}, '281': {'count': 50}, '282': {'count': 50}, '283': {'count': 50}, '284': {'count': 50}, '285': {'count': 50}, '286': {'count': 50}, '287': {'count': 50}, '288': {'count': 50}, '289': {'count': 50}, '290': {'count': 50}, '291': {'count': 50}, '292': {'count': 50}, '293': {'count': 50}, '294': {'count': 50}, '295': {'count': 50}, '296': {'count': 50}, '297': {'count': 50}, '298': {'count': 50}, '299': {'count': 50}, '300': {'count': 50}, '301': {'count': 50}, '302': {'count': 50}, '303': {'count': 50}, '304': {'count': 50}, '305': {'count': 50}, '306': {'count': 50}, '307': {'count': 50}, '308': {'count': 50}, '309': {'count': 50}, '310': {'count': 50}, '311': {'count': 50}, '312': {'count': 50}, '313': {'count': 50}, '314': {'count': 50}, '315': {'count': 50}, '316': {'count': 50}, '317': {'count': 50}, '318': {'count': 50}, '319': {'count': 50}, '320': {'count': 50}, '321': {'count': 50}, '322': {'count': 50}, '323': {'count': 50}, '324': {'count': 50}, '325': {'count': 50}, '326': {'count': 50}, '327': {'count': 50}, '328': {'count': 50}, '329': {'count': 50}, '330': {'count': 50}, '331': {'count': 50}, '332': {'count': 50}, '333': {'count': 50}, '334': {'count': 50}, '335': {'count': 50}, '336': {'count': 50}, '337': {'count': 50}, '338': {'count': 50}, '339': {'count': 50}, '340': {'count': 50}, '341': {'count': 50}, '342': {'count': 50}, '343': {'count': 50}, '344': {'count': 50}, '345': {'count': 50}, '346': {'count': 50}, '347': {'count': 50}, '348': {'count': 50}, '349': {'count': 50}, '350': {'count': 50}, '351': {'count': 50}, '352': {'count': 
50}, '353': {'count': 50}, '354': {'count': 50}, '355': {'count': 50}, '356': {'count': 50}, '357': {'count': 50}, '358': {'count': 50}, '359': {'count': 50}, '360': {'count': 50}, '361': {'count': 50}, '362': {'count': 50}, '363': {'count': 50}, '364': {'count': 50}, '365': {'count': 50}, '366': {'count': 50}, '367': {'count': 50}, '368': {'count': 50}, '369': {'count': 50}, '370': {'count': 50}, '371': {'count': 50}, '372': {'count': 50}, '373': {'count': 50}, '374': {'count': 50}, '375': {'count': 50}, '376': {'count': 50}, '377': {'count': 50}, '378': {'count': 50}, '379': {'count': 50}, '380': {'count': 50}, '381': {'count': 50}, '382': {'count': 50}, '383': {'count': 50}, '384': {'count': 50}, '385': {'count': 50}, '386': {'count': 50}, '387': {'count': 50}, '388': {'count': 50}, '389': {'count': 50}, '390': {'count': 50}, '391': {'count': 50}, '392': {'count': 50}, '393': {'count': 50}, '394': {'count': 50}, '395': {'count': 50}, '396': {'count': 50}, '397': {'count': 50}, '398': {'count': 50}, '399': {'count': 50}, '400': {'count': 50}, '401': {'count': 50}, '402': {'count': 50}, '403': {'count': 50}, '404': {'count': 50}, '405': {'count': 50}, '406': {'count': 50}, '407': {'count': 50}, '408': {'count': 50}, '409': {'count': 50}, '410': {'count': 50}, '411': {'count': 50}, '412': {'count': 50}, '413': {'count': 50}, '414': {'count': 50}, '415': {'count': 50}, '416': {'count': 50}, '417': {'count': 50}, '418': {'count': 50}, '419': {'count': 50}, '420': {'count': 50}, '421': {'count': 50}, '422': {'count': 50}, '423': {'count': 50}, '424': {'count': 50}, '425': {'count': 50}, '426': {'count': 50}, '427': {'count': 50}, '428': {'count': 50}, '429': {'count': 50}, '430': {'count': 50}, '431': {'count': 50}, '432': {'count': 50}, '433': {'count': 50}, '434': {'count': 50}, '435': {'count': 50}, '436': {'count': 50}, '437': {'count': 50}, '438': {'count': 50}, '439': {'count': 50}, '440': {'count': 50}, '441': {'count': 50}, '442': {'count': 50}, '443': 
{'count': 50}, '444': {'count': 50}, '445': {'count': 50}, '446': {'count': 50}, '447': {'count': 50}, '448': {'count': 50}, '449': {'count': 50}, '450': {'count': 50}, '451': {'count': 50}, '452': {'count': 50}, '453': {'count': 50}, '454': {'count': 50}, '455': {'count': 50}, '456': {'count': 50}, '457': {'count': 50}, '458': {'count': 50}, '459': {'count': 50}, '460': {'count': 50}, '461': {'count': 50}, '462': {'count': 50}, '463': {'count': 50}, '464': {'count': 50}, '465': {'count': 50}, '466': {'count': 50}, '467': {'count': 50}, '468': {'count': 50}, '469': {'count': 50}, '470': {'count': 50}, '471': {'count': 50}, '472': {'count': 50}, '473': {'count': 50}, '474': {'count': 50}, '475': {'count': 50}, '476': {'count': 50}, '477': {'count': 50}, '478': {'count': 50}, '479': {'count': 50}, '480': {'count': 50}, '481': {'count': 50}, '482': {'count': 50}, '483': {'count': 50}, '484': {'count': 50}, '485': {'count': 50}, '486': {'count': 50}, '487': {'count': 50}, '488': {'count': 50}, '489': {'count': 50}, '490': {'count': 50}, '491': {'count': 50}, '492': {'count': 50}, '493': {'count': 50}, '494': {'count': 50}, '495': {'count': 50}, '496': {'count': 50}, '497': {'count': 50}, '498': {'count': 50}, '499': {'count': 50}, '500': {'count': 50}, '501': {'count': 50}, '502': {'count': 50}, '503': {'count': 50}, '504': {'count': 50}, '505': {'count': 50}, '506': {'count': 50}, '507': {'count': 50}, '508': {'count': 50}, '509': {'count': 50}, '510': {'count': 50}, '511': {'count': 50}, '512': {'count': 50}, '513': {'count': 50}, '514': {'count': 50}, '515': {'count': 50}, '516': {'count': 50}, '517': {'count': 50}, '518': {'count': 50}, '519': {'count': 50}, '520': {'count': 50}, '521': {'count': 50}, '522': {'count': 50}, '523': {'count': 50}, '524': {'count': 50}, '525': {'count': 50}, '526': {'count': 50}, '527': {'count': 50}, '528': {'count': 50}, '529': {'count': 50}, '530': {'count': 50}, '531': {'count': 50}, '532': {'count': 50}, '533': {'count': 50}, 
'534': {'count': 50}, '535': {'count': 50}, '536': {'count': 50}, '537': {'count': 50}, '538': {'count': 50}, '539': {'count': 50}, '540': {'count': 50}, '541': {'count': 50}, '542': {'count': 50}, '543': {'count': 50}, '544': {'count': 50}, '545': {'count': 50}, '546': {'count': 50}, '547': {'count': 50}, '548': {'count': 50}, '549': {'count': 50}, '550': {'count': 50}, '551': {'count': 50}, '552': {'count': 50}, '553': {'count': 50}, '554': {'count': 50}, '555': {'count': 50}, '556': {'count': 50}, '557': {'count': 50}, '558': {'count': 50}, '559': {'count': 50}, '560': {'count': 50}, '561': {'count': 50}, '562': {'count': 50}, '563': {'count': 50}, '564': {'count': 50}, '565': {'count': 50}, '566': {'count': 50}, '567': {'count': 50}, '568': {'count': 50}, '569': {'count': 50}, '570': {'count': 50}, '571': {'count': 50}, '572': {'count': 50}, '573': {'count': 50}, '574': {'count': 50}, '575': {'count': 50}, '576': {'count': 50}, '577': {'count': 50}, '578': {'count': 50}, '579': {'count': 50}, '580': {'count': 50}, '581': {'count': 50}, '582': {'count': 50}, '583': {'count': 50}, '584': {'count': 50}, '585': {'count': 50}, '586': {'count': 50}, '587': {'count': 50}, '588': {'count': 50}, '589': {'count': 50}, '590': {'count': 50}, '591': {'count': 50}, '592': {'count': 50}, '593': {'count': 50}, '594': {'count': 50}, '595': {'count': 50}, '596': {'count': 50}, '597': {'count': 50}, '598': {'count': 50}, '599': {'count': 50}, '600': {'count': 50}, '601': {'count': 50}, '602': {'count': 50}, '603': {'count': 50}, '604': {'count': 50}, '605': {'count': 50}, '606': {'count': 50}, '607': {'count': 50}, '608': {'count': 50}, '609': {'count': 50}, '610': {'count': 50}, '611': {'count': 50}, '612': {'count': 50}, '613': {'count': 50}, '614': {'count': 50}, '615': {'count': 50}, '616': {'count': 50}, '617': {'count': 50}, '618': {'count': 50}, '619': {'count': 50}, '620': {'count': 50}, '621': {'count': 50}, '622': {'count': 50}, '623': {'count': 50}, '624': {'count': 
50}, '625': {'count': 50}, '626': {'count': 50}, '627': {'count': 50}, '628': {'count': 50}, '629': {'count': 50}, '630': {'count': 50}, '631': {'count': 50}, '632': {'count': 50}, '633': {'count': 50}, '634': {'count': 50}, '635': {'count': 50}, '636': {'count': 50}, '637': {'count': 50}, '638': {'count': 50}, '639': {'count': 50}, '640': {'count': 50}, '641': {'count': 50}, '642': {'count': 50}, '643': {'count': 50}, '644': {'count': 50}, '645': {'count': 50}, '646': {'count': 50}, '647': {'count': 50}, '648': {'count': 50}, '649': {'count': 50}, '650': {'count': 50}, '651': {'count': 50}, '652': {'count': 50}, '653': {'count': 50}, '654': {'count': 50}, '655': {'count': 50}, '656': {'count': 50}, '657': {'count': 50}, '658': {'count': 50}, '659': {'count': 50}, '660': {'count': 50}, '661': {'count': 50}, '662': {'count': 50}, '663': {'count': 50}, '664': {'count': 50}, '665': {'count': 50}, '666': {'count': 50}, '667': {'count': 50}, '668': {'count': 50}, '669': {'count': 50}, '670': {'count': 50}, '671': {'count': 50}, '672': {'count': 50}, '673': {'count': 50}, '674': {'count': 50}, '675': {'count': 50}, '676': {'count': 50}, '677': {'count': 50}, '678': {'count': 50}, '679': {'count': 50}, '680': {'count': 50}, '681': {'count': 50}, '682': {'count': 50}, '683': {'count': 50}, '684': {'count': 50}, '685': {'count': 50}, '686': {'count': 50}, '687': {'count': 50}, '688': {'count': 50}, '689': {'count': 50}, '690': {'count': 50}, '691': {'count': 50}, '692': {'count': 50}, '693': {'count': 50}, '694': {'count': 50}, '695': {'count': 50}, '696': {'count': 50}, '697': {'count': 50}, '698': {'count': 50}, '699': {'count': 50}, '700': {'count': 50}, '701': {'count': 50}, '702': {'count': 50}, '703': {'count': 50}, '704': {'count': 50}, '705': {'count': 50}, '706': {'count': 50}, '707': {'count': 50}, '708': {'count': 50}, '709': {'count': 50}, '710': {'count': 50}, '711': {'count': 50}, '712': {'count': 50}, '713': {'count': 50}, '714': {'count': 50}, '715': 
{'count': 50}, '716': {'count': 50}, '717': {'count': 50}, '718': {'count': 50}, '719': {'count': 50}, '720': {'count': 50}, '721': {'count': 50}, '722': {'count': 50}, '723': {'count': 50}, '724': {'count': 50}, '725': {'count': 50}, '726': {'count': 50}, '727': {'count': 50}, '728': {'count': 50}, '729': {'count': 50}, '730': {'count': 50}, '731': {'count': 50}, '732': {'count': 50}, '733': {'count': 50}, '734': {'count': 50}, '735': {'count': 50}, '736': {'count': 50}, '737': {'count': 50}, '738': {'count': 50}, '739': {'count': 50}, '740': {'count': 50}, '741': {'count': 50}, '742': {'count': 50}, '743': {'count': 50}, '744': {'count': 50}, '745': {'count': 50}, '746': {'count': 50}, '747': {'count': 50}, '748': {'count': 50}, '749': {'count': 50}, '750': {'count': 50}, '751': {'count': 50}, '752': {'count': 50}, '753': {'count': 50}, '754': {'count': 50}, '755': {'count': 50}, '756': {'count': 50}, '757': {'count': 50}, '758': {'count': 50}, '759': {'count': 50}, '760': {'count': 50}, '761': {'count': 50}, '762': {'count': 50}, '763': {'count': 50}, '764': {'count': 50}, '765': {'count': 50}, '766': {'count': 50}, '767': {'count': 50}, '768': {'count': 50}, '769': {'count': 50}, '770': {'count': 50}, '771': {'count': 50}, '772': {'count': 50}, '773': {'count': 50}, '774': {'count': 50}, '775': {'count': 50}, '776': {'count': 50}, '777': {'count': 50}, '778': {'count': 50}, '779': {'count': 50}, '780': {'count': 50}, '781': {'count': 50}, '782': {'count': 50}, '783': {'count': 50}, '784': {'count': 50}, '785': {'count': 50}, '786': {'count': 50}, '787': {'count': 50}, '788': {'count': 50}, '789': {'count': 50}, '790': {'count': 50}, '791': {'count': 50}, '792': {'count': 50}, '793': {'count': 50}, '794': {'count': 50}, '795': {'count': 50}, '796': {'count': 50}, '797': {'count': 50}, '798': {'count': 50}, '799': {'count': 50}, '800': {'count': 50}, '801': {'count': 50}, '802': {'count': 50}, '803': {'count': 50}, '804': {'count': 50}, '805': {'count': 50}, 
'806': {'count': 50}, '807': {'count': 50}, '808': {'count': 50}, '809': {'count': 50}, '810': {'count': 50}, '811': {'count': 50}, '812': {'count': 50}, '813': {'count': 50}, '814': {'count': 50}, '815': {'count': 50}, '816': {'count': 50}, '817': {'count': 50}, '818': {'count': 50}, '819': {'count': 50}, '820': {'count': 50}, '821': {'count': 50}, '822': {'count': 50}, '823': {'count': 50}, '824': {'count': 50}, '825': {'count': 50}, '826': {'count': 50}, '827': {'count': 50}, '828': {'count': 50}, '829': {'count': 50}, '830': {'count': 50}, '831': {'count': 50}, '832': {'count': 50}, '833': {'count': 50}, '834': {'count': 50}, '835': {'count': 50}, '836': {'count': 50}, '837': {'count': 50}, '838': {'count': 50}, '839': {'count': 50}, '840': {'count': 50}, '841': {'count': 50}, '842': {'count': 50}, '843': {'count': 50}, '844': {'count': 50}, '845': {'count': 50}, '846': {'count': 50}, '847': {'count': 50}, '848': {'count': 50}, '849': {'count': 50}, '850': {'count': 50}, '851': {'count': 50}, '852': {'count': 50}, '853': {'count': 50}, '854': {'count': 50}, '855': {'count': 50}, '856': {'count': 50}, '857': {'count': 50}, '858': {'count': 50}, '859': {'count': 50}, '860': {'count': 50}, '861': {'count': 50}, '862': {'count': 50}, '863': {'count': 50}, '864': {'count': 50}, '865': {'count': 50}, '866': {'count': 50}, '867': {'count': 50}, '868': {'count': 50}, '869': {'count': 50}, '870': {'count': 50}, '871': {'count': 50}, '872': {'count': 50}, '873': {'count': 50}, '874': {'count': 50}, '875': {'count': 50}, '876': {'count': 50}, '877': {'count': 50}, '878': {'count': 50}, '879': {'count': 50}, '880': {'count': 50}, '881': {'count': 50}, '882': {'count': 50}, '883': {'count': 50}, '884': {'count': 50}, '885': {'count': 50}, '886': {'count': 50}, '887': {'count': 50}, '888': {'count': 50}, '889': {'count': 50}, '890': {'count': 50}, '891': {'count': 50}, '892': {'count': 50}, '893': {'count': 50}, '894': {'count': 50}, '895': {'count': 50}, '896': {'count': 
50}, '897': {'count': 50}, '898': {'count': 50}, '899': {'count': 50}, '900': {'count': 50}, '901': {'count': 50}, '902': {'count': 50}, '903': {'count': 50}, '904': {'count': 50}, '905': {'count': 50}, '906': {'count': 50}, '907': {'count': 50}, '908': {'count': 50}, '909': {'count': 50}, '910': {'count': 50}, '911': {'count': 50}, '912': {'count': 50}, '913': {'count': 50}, '914': {'count': 50}, '915': {'count': 50}, '916': {'count': 50}, '917': {'count': 50}, '918': {'count': 50}, '919': {'count': 50}, '920': {'count': 50}, '921': {'count': 50}, '922': {'count': 50}, '923': {'count': 50}, '924': {'count': 50}, '925': {'count': 50}, '926': {'count': 50}, '927': {'count': 50}, '928': {'count': 50}, '929': {'count': 50}, '930': {'count': 50}, '931': {'count': 50}, '932': {'count': 50}, '933': {'count': 50}, '934': {'count': 50}, '935': {'count': 50}, '936': {'count': 50}, '937': {'count': 50}, '938': {'count': 50}, '939': {'count': 50}, '940': {'count': 50}, '941': {'count': 50}, '942': {'count': 50}, '943': {'count': 50}, '944': {'count': 50}, '945': {'count': 50}, '946': {'count': 50}, '947': {'count': 50}, '948': {'count': 50}, '949': {'count': 50}, '950': {'count': 50}, '951': {'count': 50}, '952': {'count': 50}, '953': {'count': 50}, '954': {'count': 50}, '955': {'count': 50}, '956': {'count': 50}, '957': {'count': 50}, '958': {'count': 50}, '959': {'count': 50}, '960': {'count': 50}, '961': {'count': 50}, '962': {'count': 50}, '963': {'count': 50}, '964': {'count': 50}, '965': {'count': 50}, '966': {'count': 50}, '967': {'count': 50}, '968': {'count': 50}, '969': {'count': 50}, '970': {'count': 50}, '971': {'count': 50}, '972': {'count': 50}, '973': {'count': 50}, '974': {'count': 50}, '975': {'count': 50}, '976': {'count': 50}, '977': {'count': 50}, '978': {'count': 50}, '979': {'count': 50}, '980': {'count': 50}, '981': {'count': 50}, '982': {'count': 50}, '983': {'count': 50}, '984': {'count': 50}, '985': {'count': 50}, '986': {'count': 50}, '987': 
{'count': 50}, '988': {'count': 50}, '989': {'count': 50}, '990': {'count': 50}, '991': {'count': 50}, '992': {'count': 50}, '993': {'count': 50}, '994': {'count': 50}, '995': {'count': 50}, '996': {'count': 50}, '997': {'count': 50}, '998': {'count': 50}, '999': {'count': 50}}}} | -| [ImdbClassification](http://www.aclweb.org/anthology/P11-1015) | ['eng'] | Classification | p2p | [Reviews, Written] | None | None | -| [InappropriatenessClassification](https://aclanthology.org/2021.bsnlp-1.4) | ['rus'] | Classification | s2s | [Social, Web, Written] | None | None | +| [ImdbClassification](http://www.aclweb.org/anthology/P11-1015) (Maas et al., 2011) | ['eng'] | Classification | p2p | [Reviews, Written] | None | None | +| [InappropriatenessClassification](https://aclanthology.org/2021.bsnlp-1.4) (Babakov et al., 2021) | ['rus'] | Classification | s2s | [Social, Web, Written] | None | None | +| [InappropriatenessClassificationv2](https://aclanthology.org/2021.bsnlp-1.4) (Babakov et al., 2021) | ['rus'] | Classification | t2t | [Social, Web, Written] | None | None | | [IndicCrosslingualSTS](https://huggingface.co/datasets/jaygala24/indic_sts) (Ramesh et al., 2022) | ['asm', 'ben', 'eng', 'guj', 'hin', 'kan', 'mal', 'mar', 'ory', 'pan', 'tam', 'tel', 'urd'] | STS | s2s | [Government, News, Non-fiction, Spoken, Spoken, Web, Written] | None | None | | [IndicGenBenchFloresBitextMining](https://github.com/google-research-datasets/indic-gen-bench/) (Harman Singh, 2024) | ['asm', 'awa', 'ben', 'bgc', 'bho', 'bod', 'boy', 'eng', 'gbm', 'gom', 'guj', 'hin', 'hne', 'kan', 'mai', 'mal', 'mar', 'mni', 'mup', 'mwr', 'nep', 'ory', 'pan', 'pus', 'raj', 'san', 'sat', 'tam', 'tel', 'urd'] | BitextMining | s2s | [News, Web, Written] | {'validation': 57826, 'test': 58696} | {'validation': {'num_samples': 57826, 'number_of_characters': 14600950, 'unique_pairs': 57826, 'min_sentence1_length': 24, 'average_sentence1_length': 126.25, 'max_sentence1_length': 368, 'unique_sentence1': 29903, 
'min_sentence2_length': 24, 'average_sentence2_length': 126.24, 'max_sentence2_length': 368, 'unique_sentence2': 29903, 'hf_subset_descriptive_stats': {'ben-eng': {'num_samples': 997, 'number_of_characters': 248469, 'unique_pairs': 997, 'min_sentence1_length': 30, 'average_sentence1_length': 123.65, 'max_sentence1_length': 320, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-ben': {'num_samples': 997, 'number_of_characters': 248469, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 30, 'average_sentence2_length': 123.65, 'max_sentence2_length': 320, 'unique_sentence2': 997}, 'guj-eng': {'num_samples': 997, 'number_of_characters': 245477, 'unique_pairs': 997, 'min_sentence1_length': 30, 'average_sentence1_length': 120.64, 'max_sentence1_length': 368, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-guj': {'num_samples': 997, 'number_of_characters': 245477, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 30, 'average_sentence2_length': 120.64, 'max_sentence2_length': 368, 'unique_sentence2': 997}, 'hin-eng': {'num_samples': 997, 'number_of_characters': 250573, 'unique_pairs': 997, 'min_sentence1_length': 31, 'average_sentence1_length': 125.76, 'max_sentence1_length': 355, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-hin': {'num_samples': 997, 'number_of_characters': 250564, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 31, 
'average_sentence2_length': 125.75, 'max_sentence2_length': 355, 'unique_sentence2': 997}, 'kan-eng': {'num_samples': 997, 'number_of_characters': 257131, 'unique_pairs': 997, 'min_sentence1_length': 34, 'average_sentence1_length': 132.33, 'max_sentence1_length': 331, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-kan': {'num_samples': 997, 'number_of_characters': 256986, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 34, 'average_sentence2_length': 132.19, 'max_sentence2_length': 331, 'unique_sentence2': 997}, 'mal-eng': {'num_samples': 997, 'number_of_characters': 267295, 'unique_pairs': 997, 'min_sentence1_length': 31, 'average_sentence1_length': 142.53, 'max_sentence1_length': 360, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-mal': {'num_samples': 997, 'number_of_characters': 267296, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 31, 'average_sentence2_length': 142.53, 'max_sentence2_length': 360, 'unique_sentence2': 997}, 'mar-eng': {'num_samples': 997, 'number_of_characters': 251107, 'unique_pairs': 997, 'min_sentence1_length': 29, 'average_sentence1_length': 126.29, 'max_sentence1_length': 321, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-mar': {'num_samples': 997, 'number_of_characters': 250897, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 29, 'average_sentence2_length': 126.08, 'max_sentence2_length': 321, 
'unique_sentence2': 997}, 'tam-eng': {'num_samples': 997, 'number_of_characters': 271322, 'unique_pairs': 997, 'min_sentence1_length': 30, 'average_sentence1_length': 146.57, 'max_sentence1_length': 358, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-tam': {'num_samples': 997, 'number_of_characters': 271322, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 30, 'average_sentence2_length': 146.57, 'max_sentence2_length': 358, 'unique_sentence2': 997}, 'tel-eng': {'num_samples': 997, 'number_of_characters': 252385, 'unique_pairs': 997, 'min_sentence1_length': 29, 'average_sentence1_length': 127.57, 'max_sentence1_length': 317, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-tel': {'num_samples': 997, 'number_of_characters': 252380, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 29, 'average_sentence2_length': 127.57, 'max_sentence2_length': 317, 'unique_sentence2': 997}, 'urd-eng': {'num_samples': 997, 'number_of_characters': 249824, 'unique_pairs': 997, 'min_sentence1_length': 37, 'average_sentence1_length': 125.01, 'max_sentence1_length': 295, 'unique_sentence1': 996, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-urd': {'num_samples': 997, 'number_of_characters': 249824, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 37, 'average_sentence2_length': 125.01, 'max_sentence2_length': 295, 'unique_sentence2': 996}, 'asm-eng': {'num_samples': 997, 
'number_of_characters': 246220, 'unique_pairs': 997, 'min_sentence1_length': 30, 'average_sentence1_length': 121.39, 'max_sentence1_length': 314, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-asm': {'num_samples': 997, 'number_of_characters': 246224, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 30, 'average_sentence2_length': 121.39, 'max_sentence2_length': 314, 'unique_sentence2': 997}, 'bho-eng': {'num_samples': 997, 'number_of_characters': 246895, 'unique_pairs': 997, 'min_sentence1_length': 25, 'average_sentence1_length': 122.07, 'max_sentence1_length': 326, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-bho': {'num_samples': 997, 'number_of_characters': 246919, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 25, 'average_sentence2_length': 122.09, 'max_sentence2_length': 326, 'unique_sentence2': 997}, 'nep-eng': {'num_samples': 997, 'number_of_characters': 245984, 'unique_pairs': 997, 'min_sentence1_length': 24, 'average_sentence1_length': 121.15, 'max_sentence1_length': 307, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-nep': {'num_samples': 997, 'number_of_characters': 245984, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 24, 'average_sentence2_length': 121.15, 'max_sentence2_length': 307, 'unique_sentence2': 997}, 'ory-eng': {'num_samples': 997, 'number_of_characters': 254206, 'unique_pairs': 997, 
'min_sentence1_length': 34, 'average_sentence1_length': 129.4, 'max_sentence1_length': 308, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-ory': {'num_samples': 997, 'number_of_characters': 254206, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 34, 'average_sentence2_length': 129.4, 'max_sentence2_length': 308, 'unique_sentence2': 997}, 'pan-eng': {'num_samples': 997, 'number_of_characters': 251598, 'unique_pairs': 997, 'min_sentence1_length': 29, 'average_sentence1_length': 126.78, 'max_sentence1_length': 309, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-pan': {'num_samples': 997, 'number_of_characters': 251597, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 29, 'average_sentence2_length': 126.78, 'max_sentence2_length': 309, 'unique_sentence2': 997}, 'pus-eng': {'num_samples': 997, 'number_of_characters': 247450, 'unique_pairs': 997, 'min_sentence1_length': 32, 'average_sentence1_length': 122.62, 'max_sentence1_length': 300, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-pus': {'num_samples': 997, 'number_of_characters': 247450, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 32, 'average_sentence2_length': 122.62, 'max_sentence2_length': 300, 'unique_sentence2': 997}, 'san-eng': {'num_samples': 997, 'number_of_characters': 249042, 'unique_pairs': 997, 'min_sentence1_length': 31, 'average_sentence1_length': 124.22, 
'max_sentence1_length': 311, 'unique_sentence1': 994, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-san': {'num_samples': 997, 'number_of_characters': 248877, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 31, 'average_sentence2_length': 124.06, 'max_sentence2_length': 311, 'unique_sentence2': 994}, 'awa-eng': {'num_samples': 997, 'number_of_characters': 247944, 'unique_pairs': 997, 'min_sentence1_length': 34, 'average_sentence1_length': 123.12, 'max_sentence1_length': 329, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-awa': {'num_samples': 997, 'number_of_characters': 247884, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 34, 'average_sentence2_length': 123.06, 'max_sentence2_length': 329, 'unique_sentence2': 997}, 'bgc-eng': {'num_samples': 997, 'number_of_characters': 245935, 'unique_pairs': 997, 'min_sentence1_length': 27, 'average_sentence1_length': 121.1, 'max_sentence1_length': 303, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-bgc': {'num_samples': 997, 'number_of_characters': 245935, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 27, 'average_sentence2_length': 121.1, 'max_sentence2_length': 303, 'unique_sentence2': 997}, 'bod-eng': {'num_samples': 997, 'number_of_characters': 266515, 'unique_pairs': 997, 'min_sentence1_length': 26, 'average_sentence1_length': 141.75, 'max_sentence1_length': 355, 'unique_sentence1': 996, 
'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-bod': {'num_samples': 997, 'number_of_characters': 266495, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 26, 'average_sentence2_length': 141.73, 'max_sentence2_length': 355, 'unique_sentence2': 996}, 'boy-eng': {'num_samples': 997, 'number_of_characters': 260174, 'unique_pairs': 997, 'min_sentence1_length': 31, 'average_sentence1_length': 135.39, 'max_sentence1_length': 312, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-boy': {'num_samples': 997, 'number_of_characters': 260174, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 31, 'average_sentence2_length': 135.39, 'max_sentence2_length': 312, 'unique_sentence2': 997}, 'gbm-eng': {'num_samples': 997, 'number_of_characters': 247009, 'unique_pairs': 997, 'min_sentence1_length': 30, 'average_sentence1_length': 122.18, 'max_sentence1_length': 344, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-gbm': {'num_samples': 997, 'number_of_characters': 247009, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 30, 'average_sentence2_length': 122.18, 'max_sentence2_length': 344, 'unique_sentence2': 997}, 'gom-eng': {'num_samples': 997, 'number_of_characters': 244553, 'unique_pairs': 997, 'min_sentence1_length': 31, 'average_sentence1_length': 119.72, 'max_sentence1_length': 306, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 
'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-gom': {'num_samples': 997, 'number_of_characters': 244553, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 31, 'average_sentence2_length': 119.72, 'max_sentence2_length': 306, 'unique_sentence2': 997}, 'hne-eng': {'num_samples': 997, 'number_of_characters': 246416, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 121.59, 'max_sentence1_length': 321, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-hne': {'num_samples': 997, 'number_of_characters': 246405, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 121.58, 'max_sentence2_length': 321, 'unique_sentence2': 997}, 'raj-eng': {'num_samples': 997, 'number_of_characters': 249541, 'unique_pairs': 997, 'min_sentence1_length': 32, 'average_sentence1_length': 124.72, 'max_sentence1_length': 313, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-raj': {'num_samples': 997, 'number_of_characters': 249541, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 32, 'average_sentence2_length': 124.72, 'max_sentence2_length': 313, 'unique_sentence2': 997}, 'mai-eng': {'num_samples': 997, 'number_of_characters': 247991, 'unique_pairs': 997, 'min_sentence1_length': 29, 'average_sentence1_length': 123.17, 'max_sentence1_length': 312, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-mai': 
{'num_samples': 997, 'number_of_characters': 247994, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 29, 'average_sentence2_length': 123.17, 'max_sentence2_length': 312, 'unique_sentence2': 997}, 'mni-eng': {'num_samples': 997, 'number_of_characters': 254308, 'unique_pairs': 997, 'min_sentence1_length': 39, 'average_sentence1_length': 129.5, 'max_sentence1_length': 310, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-mni': {'num_samples': 997, 'number_of_characters': 254312, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 39, 'average_sentence2_length': 129.51, 'max_sentence2_length': 310, 'unique_sentence2': 997}, 'mup-eng': {'num_samples': 997, 'number_of_characters': 248486, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 123.66, 'max_sentence1_length': 312, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-mup': {'num_samples': 997, 'number_of_characters': 248486, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 123.66, 'max_sentence2_length': 312, 'unique_sentence2': 997}, 'mwr-eng': {'num_samples': 997, 'number_of_characters': 248641, 'unique_pairs': 997, 'min_sentence1_length': 31, 'average_sentence1_length': 123.82, 'max_sentence1_length': 324, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-mwr': {'num_samples': 997, 'number_of_characters': 248641, 'unique_pairs': 
997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 31, 'average_sentence2_length': 123.82, 'max_sentence2_length': 324, 'unique_sentence2': 997}, 'sat-eng': {'num_samples': 997, 'number_of_characters': 258279, 'unique_pairs': 997, 'min_sentence1_length': 37, 'average_sentence1_length': 133.49, 'max_sentence1_length': 333, 'unique_sentence1': 995, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-sat': {'num_samples': 997, 'number_of_characters': 258279, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 37, 'average_sentence2_length': 133.49, 'max_sentence2_length': 333, 'unique_sentence2': 995}}}, 'test': {'num_samples': 58696, 'number_of_characters': 15359416, 'unique_pairs': 58690, 'min_sentence1_length': 33, 'average_sentence1_length': 130.84, 'max_sentence1_length': 431, 'unique_sentence1': 30351, 'min_sentence2_length': 33, 'average_sentence2_length': 130.83, 'max_sentence2_length': 431, 'unique_sentence2': 30351, 'hf_subset_descriptive_stats': {'ben-eng': {'num_samples': 1012, 'number_of_characters': 261008, 'unique_pairs': 1012, 'min_sentence1_length': 38, 'average_sentence1_length': 127.51, 'max_sentence1_length': 333, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-ben': {'num_samples': 1012, 'number_of_characters': 261008, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 38, 'average_sentence2_length': 127.51, 'max_sentence2_length': 333, 'unique_sentence2': 1012}, 'guj-eng': {'num_samples': 1012, 'number_of_characters': 258394, 'unique_pairs': 1012, 
'min_sentence1_length': 38, 'average_sentence1_length': 124.93, 'max_sentence1_length': 349, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-guj': {'num_samples': 1012, 'number_of_characters': 258394, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 38, 'average_sentence2_length': 124.93, 'max_sentence2_length': 349, 'unique_sentence2': 1012}, 'hin-eng': {'num_samples': 1012, 'number_of_characters': 263040, 'unique_pairs': 1012, 'min_sentence1_length': 41, 'average_sentence1_length': 129.52, 'max_sentence1_length': 381, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-hin': {'num_samples': 1012, 'number_of_characters': 263029, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 41, 'average_sentence2_length': 129.51, 'max_sentence2_length': 381, 'unique_sentence2': 1012}, 'kan-eng': {'num_samples': 1012, 'number_of_characters': 270091, 'unique_pairs': 1012, 'min_sentence1_length': 43, 'average_sentence1_length': 136.49, 'max_sentence1_length': 388, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-kan': {'num_samples': 1012, 'number_of_characters': 270021, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 43, 'average_sentence2_length': 136.42, 'max_sentence2_length': 388, 'unique_sentence2': 1012}, 'mal-eng': {'num_samples': 1012, 'number_of_characters': 281302, 'unique_pairs': 1012, 'min_sentence1_length': 48, 
'average_sentence1_length': 147.57, 'max_sentence1_length': 376, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-mal': {'num_samples': 1012, 'number_of_characters': 281302, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 48, 'average_sentence2_length': 147.57, 'max_sentence2_length': 376, 'unique_sentence2': 1012}, 'mar-eng': {'num_samples': 1012, 'number_of_characters': 265212, 'unique_pairs': 1012, 'min_sentence1_length': 34, 'average_sentence1_length': 131.67, 'max_sentence1_length': 356, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-mar': {'num_samples': 1012, 'number_of_characters': 265023, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 34, 'average_sentence2_length': 131.48, 'max_sentence2_length': 355, 'unique_sentence2': 1012}, 'tam-eng': {'num_samples': 1012, 'number_of_characters': 286099, 'unique_pairs': 1012, 'min_sentence1_length': 48, 'average_sentence1_length': 152.31, 'max_sentence1_length': 404, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-tam': {'num_samples': 1012, 'number_of_characters': 286099, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 48, 'average_sentence2_length': 152.31, 'max_sentence2_length': 404, 'unique_sentence2': 1012}, 'tel-eng': {'num_samples': 1012, 'number_of_characters': 264460, 'unique_pairs': 1012, 'min_sentence1_length': 39, 'average_sentence1_length': 130.92, 
'max_sentence1_length': 359, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-tel': {'num_samples': 1012, 'number_of_characters': 264447, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 39, 'average_sentence2_length': 130.91, 'max_sentence2_length': 359, 'unique_sentence2': 1012}, 'urd-eng': {'num_samples': 1012, 'number_of_characters': 261886, 'unique_pairs': 1012, 'min_sentence1_length': 34, 'average_sentence1_length': 128.38, 'max_sentence1_length': 348, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-urd': {'num_samples': 1012, 'number_of_characters': 261885, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 34, 'average_sentence2_length': 128.38, 'max_sentence2_length': 348, 'unique_sentence2': 1012}, 'asm-eng': {'num_samples': 1012, 'number_of_characters': 257902, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 124.44, 'max_sentence1_length': 329, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-asm': {'num_samples': 1012, 'number_of_characters': 257909, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 124.45, 'max_sentence2_length': 329, 'unique_sentence2': 1012}, 'bho-eng': {'num_samples': 1012, 'number_of_characters': 260578, 'unique_pairs': 1012, 'min_sentence1_length': 36, 'average_sentence1_length': 127.09, 'max_sentence1_length': 367, 'unique_sentence1': 
1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-bho': {'num_samples': 1012, 'number_of_characters': 260601, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 36, 'average_sentence2_length': 127.11, 'max_sentence2_length': 367, 'unique_sentence2': 1012}, 'nep-eng': {'num_samples': 1012, 'number_of_characters': 258869, 'unique_pairs': 1012, 'min_sentence1_length': 34, 'average_sentence1_length': 125.4, 'max_sentence1_length': 362, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-nep': {'num_samples': 1012, 'number_of_characters': 258869, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 34, 'average_sentence2_length': 125.4, 'max_sentence2_length': 362, 'unique_sentence2': 1012}, 'ory-eng': {'num_samples': 1012, 'number_of_characters': 266805, 'unique_pairs': 1012, 'min_sentence1_length': 38, 'average_sentence1_length': 133.24, 'max_sentence1_length': 354, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-ory': {'num_samples': 1012, 'number_of_characters': 266805, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 38, 'average_sentence2_length': 133.24, 'max_sentence2_length': 354, 'unique_sentence2': 1012}, 'pan-eng': {'num_samples': 1012, 'number_of_characters': 265391, 'unique_pairs': 1012, 'min_sentence1_length': 37, 'average_sentence1_length': 131.84, 'max_sentence1_length': 380, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 
'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-pan': {'num_samples': 1012, 'number_of_characters': 265391, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 37, 'average_sentence2_length': 131.84, 'max_sentence2_length': 380, 'unique_sentence2': 1012}, 'pus-eng': {'num_samples': 1012, 'number_of_characters': 254422, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 121.0, 'max_sentence1_length': 325, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-pus': {'num_samples': 1012, 'number_of_characters': 254421, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 121.0, 'max_sentence2_length': 325, 'unique_sentence2': 1012}, 'san-eng': {'num_samples': 1012, 'number_of_characters': 260339, 'unique_pairs': 1012, 'min_sentence1_length': 33, 'average_sentence1_length': 126.85, 'max_sentence1_length': 358, 'unique_sentence1': 1011, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-san': {'num_samples': 1012, 'number_of_characters': 260224, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 33, 'average_sentence2_length': 126.74, 'max_sentence2_length': 358, 'unique_sentence2': 1011}, 'awa-eng': {'num_samples': 1012, 'number_of_characters': 260179, 'unique_pairs': 1012, 'min_sentence1_length': 34, 'average_sentence1_length': 126.69, 'max_sentence1_length': 378, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 
'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-awa': {'num_samples': 1012, 'number_of_characters': 260137, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 34, 'average_sentence2_length': 126.65, 'max_sentence2_length': 378, 'unique_sentence2': 1012}, 'bgc-eng': {'num_samples': 1012, 'number_of_characters': 257450, 'unique_pairs': 1012, 'min_sentence1_length': 38, 'average_sentence1_length': 124.0, 'max_sentence1_length': 332, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-bgc': {'num_samples': 1012, 'number_of_characters': 257450, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 38, 'average_sentence2_length': 124.0, 'max_sentence2_length': 332, 'unique_sentence2': 1012}, 'bod-eng': {'num_samples': 1012, 'number_of_characters': 280188, 'unique_pairs': 1012, 'min_sentence1_length': 42, 'average_sentence1_length': 146.46, 'max_sentence1_length': 431, 'unique_sentence1': 1009, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-bod': {'num_samples': 1012, 'number_of_characters': 280126, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 42, 'average_sentence2_length': 146.4, 'max_sentence2_length': 431, 'unique_sentence2': 1009}, 'boy-eng': {'num_samples': 1012, 'number_of_characters': 277538, 'unique_pairs': 1012, 'min_sentence1_length': 36, 'average_sentence1_length': 143.85, 'max_sentence1_length': 396, 'unique_sentence1': 1011, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 
1012}, 'eng-boy': {'num_samples': 1012, 'number_of_characters': 277538, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 36, 'average_sentence2_length': 143.85, 'max_sentence2_length': 396, 'unique_sentence2': 1011}, 'gbm-eng': {'num_samples': 1012, 'number_of_characters': 261027, 'unique_pairs': 1012, 'min_sentence1_length': 38, 'average_sentence1_length': 127.53, 'max_sentence1_length': 333, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-gbm': {'num_samples': 1012, 'number_of_characters': 261027, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 38, 'average_sentence2_length': 127.53, 'max_sentence2_length': 333, 'unique_sentence2': 1012}, 'gom-eng': {'num_samples': 1012, 'number_of_characters': 259182, 'unique_pairs': 1012, 'min_sentence1_length': 37, 'average_sentence1_length': 125.71, 'max_sentence1_length': 335, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-gom': {'num_samples': 1012, 'number_of_characters': 259182, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 37, 'average_sentence2_length': 125.71, 'max_sentence2_length': 335, 'unique_sentence2': 1012}, 'hne-eng': {'num_samples': 1012, 'number_of_characters': 258911, 'unique_pairs': 1012, 'min_sentence1_length': 42, 'average_sentence1_length': 125.44, 'max_sentence1_length': 327, 'unique_sentence1': 1011, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-hne': {'num_samples': 1012, 
'number_of_characters': 258915, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 42, 'average_sentence2_length': 125.44, 'max_sentence2_length': 326, 'unique_sentence2': 1011}, 'raj-eng': {'num_samples': 1012, 'number_of_characters': 261987, 'unique_pairs': 1012, 'min_sentence1_length': 38, 'average_sentence1_length': 128.48, 'max_sentence1_length': 338, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-raj': {'num_samples': 1012, 'number_of_characters': 261987, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 38, 'average_sentence2_length': 128.48, 'max_sentence2_length': 338, 'unique_sentence2': 1012}, 'mai-eng': {'num_samples': 1012, 'number_of_characters': 261374, 'unique_pairs': 1012, 'min_sentence1_length': 36, 'average_sentence1_length': 127.87, 'max_sentence1_length': 350, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-mai': {'num_samples': 1012, 'number_of_characters': 261377, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 36, 'average_sentence2_length': 127.88, 'max_sentence2_length': 350, 'unique_sentence2': 1012}, 'mni-eng': {'num_samples': 1012, 'number_of_characters': 268767, 'unique_pairs': 1012, 'min_sentence1_length': 38, 'average_sentence1_length': 135.18, 'max_sentence1_length': 353, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-mni': {'num_samples': 1012, 'number_of_characters': 268768, 'unique_pairs': 
1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 38, 'average_sentence2_length': 135.18, 'max_sentence2_length': 354, 'unique_sentence2': 1012}, 'mup-eng': {'num_samples': 1012, 'number_of_characters': 262034, 'unique_pairs': 1012, 'min_sentence1_length': 40, 'average_sentence1_length': 128.53, 'max_sentence1_length': 340, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-mup': {'num_samples': 1012, 'number_of_characters': 262034, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 40, 'average_sentence2_length': 128.53, 'max_sentence2_length': 340, 'unique_sentence2': 1012}, 'mwr-eng': {'num_samples': 1012, 'number_of_characters': 263749, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.22, 'max_sentence1_length': 345, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-mwr': {'num_samples': 1012, 'number_of_characters': 263749, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.22, 'max_sentence2_length': 345, 'unique_sentence2': 1012}, 'sat-eng': {'num_samples': 1012, 'number_of_characters': 271757, 'unique_pairs': 1012, 'min_sentence1_length': 43, 'average_sentence1_length': 138.13, 'max_sentence1_length': 366, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-sat': {'num_samples': 1012, 'number_of_characters': 271757, 'unique_pairs': 1012, 'min_sentence1_length': 35, 
'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 43, 'average_sentence2_length': 138.13, 'max_sentence2_length': 366, 'unique_sentence2': 1012}}}} | -| [IndicLangClassification](https://arxiv.org/abs/2305.15814) | ['asm', 'ben', 'brx', 'doi', 'gom', 'guj', 'hin', 'kan', 'kas', 'mai', 'mal', 'mar', 'mni', 'npi', 'ory', 'pan', 'san', 'sat', 'snd', 'tam', 'tel', 'urd'] | Classification | s2s | [Non-fiction, Web, Written] | None | None | +| [IndicLangClassification](https://arxiv.org/abs/2305.15814) (Madhani et al., 2023) | ['asm', 'ben', 'brx', 'doi', 'gom', 'guj', 'hin', 'kan', 'kas', 'mai', 'mal', 'mar', 'mni', 'npi', 'ory', 'pan', 'san', 'sat', 'snd', 'tam', 'tel', 'urd'] | Classification | s2s | [Non-fiction, Web, Written] | None | None | | [IndicNLPNewsClassification](https://github.com/AI4Bharat/indicnlp_corpus#indicnlp-news-article-classification-dataset) (Anoop Kunchukuttan, 2020) | ['guj', 'kan', 'mal', 'mar', 'ori', 'pan', 'tam', 'tel'] | Classification | s2s | [News, Written] | None | None | | [IndicQARetrieval](https://arxiv.org/abs/2212.05409) (Sumanth Doddapaneni, 2022) | ['asm', 'ben', 'guj', 'hin', 'kan', 'mal', 'mar', 'ory', 'pan', 'tam', 'tel'] | Retrieval | s2p | [Web, Written] | None | None | | [IndicReviewsClusteringP2P](https://arxiv.org/abs/2212.05409) (Sumanth Doddapaneni, 2022) | ['asm', 'ben', 'brx', 'guj', 'hin', 'kan', 'mal', 'mar', 'ory', 'pan', 'tam', 'tel', 'urd'] | Clustering | p2p | [Reviews, Written] | None | None | | [IndicSentimentClassification](https://arxiv.org/abs/2212.05409) (Sumanth Doddapaneni, 2022) | ['asm', 'ben', 'brx', 'guj', 'hin', 'kan', 'mal', 'mar', 'ory', 'pan', 'tam', 'tel', 'urd'] | Classification | s2s | [Reviews, Written] | None | None | -| [IndonesianIdClickbaitClassification](http://www.sciencedirect.com/science/article/pii/S2352340920311252) | ['ind'] | Classification | s2s | [News, Written] | None | None | -| 
[IndonesianMongabayConservationClassification](https://aclanthology.org/2023.sealp-1.4/) | ['ind'] | Classification | s2s | [Web, Written] | None | None | +| [IndonesianIdClickbaitClassification](http://www.sciencedirect.com/science/article/pii/S2352340920311252) (Andika William, 2020) | ['ind'] | Classification | s2s | [News, Written] | None | None | +| [IndonesianMongabayConservationClassification](https://aclanthology.org/2023.sealp-1.4/) (Fransiska et al., 2023) | ['ind'] | Classification | s2s | [Web, Written] | None | None | | [InfoSeekIT2ITRetrieval](https://aclanthology.org/2023.emnlp-main.925) (Chen et al., 2023) | ['eng'] | Any2AnyRetrieval | it2it | [Encyclopaedic] | {'test': 499375} | {'test': {'number_of_characters': 291755841, 'num_samples': 499375, 'num_queries': 17593, 'num_documents': 481782, 'min_document_length': 8, 'average_document_length': 603.95, 'max_document_length': 4062, 'unique_documents': 481782, 'num_document_images': 481782, 'min_query_length': 21, 'average_query_length': 44.41, 'max_query_length': 87, 'unique_queries': 350, 'num_query_images': 17593, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 7.47, 'max_relevant_docs_per_query': 128, 'unique_relevant_docs': 7891}} | | [InfoSeekIT2TRetrieval](https://aclanthology.org/2023.emnlp-main.925) (Chen et al., 2023) | ['eng'] | Any2AnyRetrieval | it2t | [Encyclopaedic] | {'test': 622974} | {'test': {'number_of_characters': 360966739, 'num_samples': 622974, 'num_queries': 11323, 'num_documents': 611651, 'min_document_length': 9, 'average_document_length': 589.22, 'max_document_length': 7650, 'unique_documents': 611651, 'num_document_images': 0, 'min_query_length': 23, 'average_query_length': 50.18, 'max_query_length': 270, 'unique_queries': 269, 'num_query_images': 11323, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 6.52, 'max_relevant_docs_per_query': 66, 'unique_relevant_docs': 7799}} | | 
[InsurancePolicyInterpretationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [InternationalCitizenshipQuestionsLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [IsiZuluNewsClassification](https://huggingface.co/datasets/dsfsi/za-isizulu-siswati-news) (Madodonga et al., 2023) | ['zul'] | Classification | s2s | [News, Written] | None | None | | [ItaCaseholdClassification](https://doi.org/10.1145/3594536.3595177) (Licari et al., 2023) | ['ita'] | Classification | s2s | [Government, Legal, Written] | None | None | -| [Itacola](https://aclanthology.org/2021.findings-emnlp.250/) | ['ita'] | Classification | s2s | [Non-fiction, Spoken, Written] | None | None | +| [Itacola](https://aclanthology.org/2021.findings-emnlp.250/) (Trotta et al., 2021) | ['ita'] | Classification | s2s | [Non-fiction, Spoken, Written] | None | None | | [JCrewBlockerLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [JDReview](https://aclanthology.org/2023.nodalida-1.20/) (Xiao et al., 2023) | ['cmn'] | Classification | s2s | | None | None | | [JSICK](https://github.com/sbintuitions/JMTEB) (Yanaka et al., 2022) | ['jpn'] | STS | s2s | [Web, Written] | None | None | -| [JSTS](https://aclanthology.org/2022.lrec-1.317.pdf#page=2.00) | ['jpn'] | STS | s2s | [Web, Written] | None | None | +| [JSTS](https://aclanthology.org/2022.lrec-1.317.pdf#page=2.00) (Kurihara et al., 2022) | ['jpn'] | STS | s2s | [Web, Written] | None | None | | [JaGovFaqsRetrieval](https://github.com/sbintuitions/JMTEB) | ['jpn'] | Retrieval | s2s | [Web, Written] | None | None | | [JaQuADRetrieval](https://arxiv.org/abs/2202.01764) (ByungHoon So, 2022) | ['jpn'] | Retrieval | p2p | 
[Encyclopaedic, Non-fiction, Written] | None | None | -| [JaqketRetrieval](https://github.com/kumapo/JAQKET-dataset) | ['jpn'] | Retrieval | s2p | [Encyclopaedic, Non-fiction, Written] | {'test': 115226} | {'test': {'number_of_characters': 428294530, 'num_samples': 115226, 'num_queries': 997, 'num_documents': 114229, 'min_document_length': 16, 'average_document_length': 0.44, 'max_document_length': 98, 'unique_documents': 114229, 'min_query_length': 8, 'average_query_length': 429532.57, 'max_query_length': 188424, 'unique_queries': 997, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 989}} | +| [JaqketRetrieval](https://github.com/kumapo/JAQKET-dataset) (鈴木正敏, 2020) | ['jpn'] | Retrieval | s2p | [Encyclopaedic, Non-fiction, Written] | {'test': 115226} | {'test': {'number_of_characters': 428294530, 'num_samples': 115226, 'num_queries': 997, 'num_documents': 114229, 'min_document_length': 16, 'average_document_length': 0.44, 'max_document_length': 98, 'unique_documents': 114229, 'min_query_length': 8, 'average_query_length': 429532.57, 'max_query_length': 188424, 'unique_queries': 997, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 989}} | | [JavaneseIMDBClassification](https://github.com/w11wo/nlp-datasets#javanese-imdb) (Wongso et al., 2021) | ['jav'] | Classification | s2s | [Reviews, Written] | None | None | | [KLUE-NLI](https://arxiv.org/abs/2105.09680) (Sungjoon Park, 2021) | ['kor'] | PairClassification | s2s | [Encyclopaedic, News, Written] | None | None | | [KLUE-STS](https://arxiv.org/abs/2105.09680) (Sungjoon Park, 2021) | ['kor'] | STS | s2s | [News, Reviews, Spoken, Spoken, Written] | None | None | @@ -389,20 +403,20 @@ The following tables give you an overview of the tasks in MTEB. 
| Ko-StrategyQA (Geva et al., 2021) | ['kor'] | Retrieval | s2p | | None | None | | [KorFin](https://huggingface.co/datasets/amphora/korfin-asc) (Son et al., 2023) | ['kor'] | Classification | s2s | [Financial, News, Written] | None | None | | [KorHateClassification](https://paperswithcode.com/dataset/korean-hatespeech-dataset) (Jihyung Moon, 2020) | ['kor'] | Classification | s2s | [Social, Written] | None | None | -| [KorHateSpeechMLClassification](https://paperswithcode.com/dataset/korean-multi-label-hate-speech-dataset) | ['kor'] | MultilabelClassification | s2s | [Social, Written] | None | None | +| [KorHateSpeechMLClassification](https://paperswithcode.com/dataset/korean-multi-label-hate-speech-dataset) (Lee et al., 2022) | ['kor'] | MultilabelClassification | s2s | [Social, Written] | None | None | | [KorSTS](https://arxiv.org/abs/2004.03289) (Ham et al., 2020) | ['kor'] | STS | s2s | [News, Web] | None | None | | [KorSarcasmClassification](https://github.com/SpellOnYou/korean-sarcasm) (Kim et al., 2019) | ['kor'] | Classification | s2s | [Social, Written] | None | None | | [KurdishSentimentClassification](https://link.springer.com/article/10.1007/s10579-023-09716-6) (Badawi et al., 2024) | ['kur'] | Classification | s2s | [Web, Written] | None | None | | [LCQMC](https://aclanthology.org/2021.emnlp-main.357) (Shitao Xiao, 2024) | ['cmn'] | STS | s2s | | None | None | -| [LEMBNarrativeQARetrieval](https://huggingface.co/datasets/dwzhu/LongEmbed) | ['eng'] | Retrieval | s2p | [Fiction, Non-fiction, Written] | None | None | +| [LEMBNarrativeQARetrieval](https://huggingface.co/datasets/dwzhu/LongEmbed) (Kočiský et al., 2018) | ['eng'] | Retrieval | s2p | [Fiction, Non-fiction, Written] | None | None | | [LEMBNeedleRetrieval](https://huggingface.co/datasets/dwzhu/LongEmbed) (Zhu et al., 2024) | ['eng'] | Retrieval | s2p | [Academic, Blog, Written] | None | None | | [LEMBPasskeyRetrieval](https://huggingface.co/datasets/dwzhu/LongEmbed) (Zhu et al., 2024) | ['eng'] | 
Retrieval | s2p | [Fiction, Written] | None | None | -| [LEMBQMSumRetrieval](https://huggingface.co/datasets/dwzhu/LongEmbed) | ['eng'] | Retrieval | s2p | [Spoken, Written] | None | None | -| [LEMBSummScreenFDRetrieval](https://huggingface.co/datasets/dwzhu/LongEmbed) | ['eng'] | Retrieval | s2p | [Spoken, Written] | None | None | +| [LEMBQMSumRetrieval](https://huggingface.co/datasets/dwzhu/LongEmbed) (Zhong et al., 2021) | ['eng'] | Retrieval | s2p | [Spoken, Written] | None | None | +| [LEMBSummScreenFDRetrieval](https://huggingface.co/datasets/dwzhu/LongEmbed) (Chen et al., 2022) | ['eng'] | Retrieval | s2p | [Spoken, Written] | None | None | | [LEMBWikimQARetrieval](https://huggingface.co/datasets/dwzhu/LongEmbed) (Ho et al., 2020) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | -| [LLaVAIT2TRetrieval](https://github.com/LinWeizheDragon/FLMR/blob/main/docs/Datasets.md) | ['eng'] | Any2AnyRetrieval | it2t | [Encyclopaedic] | {'test': 11114} | {'test': {'number_of_characters': 3578932, 'num_samples': 11114, 'num_queries': 5120, 'num_documents': 5994, 'min_document_length': 17, 'average_document_length': 546.19, 'max_document_length': 2562, 'unique_documents': 5994, 'num_document_images': 0, 'min_query_length': 21, 'average_query_length': 59.58, 'max_query_length': 165, 'unique_queries': 3906, 'num_query_images': 5120, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 5108}} | +| [LLaVAIT2TRetrieval](https://github.com/LinWeizheDragon/FLMR/blob/main/docs/Datasets.md) (Lin et al., 2024) | ['eng'] | Any2AnyRetrieval | it2t | [Encyclopaedic] | {'test': 11114} | {'test': {'number_of_characters': 3578932, 'num_samples': 11114, 'num_queries': 5120, 'num_documents': 5994, 'min_document_length': 17, 'average_document_length': 546.19, 'max_document_length': 2562, 'unique_documents': 5994, 'num_document_images': 0, 'min_query_length': 21, 'average_query_length': 59.58, 
'max_query_length': 165, 'unique_queries': 3906, 'num_query_images': 5120, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 5108}} | | [LanguageClassification](https://huggingface.co/datasets/papluca/language-identification) (Conneau et al., 2018) | ['ara', 'bul', 'cmn', 'deu', 'ell', 'eng', 'fra', 'hin', 'ita', 'jpn', 'nld', 'pol', 'por', 'rus', 'spa', 'swa', 'tha', 'tur', 'urd', 'vie'] | Classification | s2s | [Fiction, Government, Non-fiction, Reviews, Web, Written] | {'test': 2048, 'train': 70000} | {'test': {'num_samples': 2048, 'number_of_characters': 224352, 'num_texts_in_train': 31, 'min_text_length': 14, 'average_text_length': 109.55, 'max_text_length': 1270, 'unique_text': 2025, 'unique_labels': 20, 'labels': {'17': {'count': 102}, '0': {'count': 102}, '11': {'count': 102}, '4': {'count': 103}, '3': {'count': 102}, '1': {'count': 102}, '10': {'count': 102}, '2': {'count': 103}, '16': {'count': 103}, '9': {'count': 103}, '5': {'count': 102}, '7': {'count': 102}, '13': {'count': 102}, '14': {'count': 103}, '12': {'count': 102}, '15': {'count': 103}, '19': {'count': 102}, '18': {'count': 102}, '6': {'count': 103}, '8': {'count': 103}}}, 'train': {'num_samples': 70000, 'number_of_characters': 7760299, 'num_texts_in_train': None, 'min_text_length': 2, 'average_text_length': 110.86, 'max_text_length': 2422, 'unique_text': 68978, 'unique_labels': 20, 'labels': {'12': {'count': 3500}, '1': {'count': 3500}, '19': {'count': 3500}, '15': {'count': 3500}, '13': {'count': 3500}, '11': {'count': 3500}, '17': {'count': 3500}, '14': {'count': 3500}, '16': {'count': 3500}, '5': {'count': 3500}, '0': {'count': 3500}, '8': {'count': 3500}, '7': {'count': 3500}, '2': {'count': 3500}, '3': {'count': 3500}, '10': {'count': 3500}, '6': {'count': 3500}, '18': {'count': 3500}, '4': {'count': 3500}, '9': {'count': 3500}}}} | -| [LccSentimentClassification](https://github.com/fnielsen/lcc-sentiment) 
| ['dan'] | Classification | s2s | [News, Web, Written] | None | None | +| [LccSentimentClassification](https://github.com/fnielsen/lcc-sentiment) (Quasthoff et al., 2006) | ['dan'] | Classification | s2s | [News, Web, Written] | None | None | | [LeCaRDv2](https://github.com/THUIR/LeCaRDv2) (Haitao Li, 2023) | ['zho'] | Retrieval | p2p | [Legal, Written] | None | None | | [LearnedHandsBenefitsLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [LearnedHandsBusinessLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | @@ -420,12 +434,13 @@ The following tables give you an overview of the tasks in MTEB. | [LearnedHandsImmigrationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [LearnedHandsTortsLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [LearnedHandsTrafficLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | -| [LegalBenchConsumerContractsQA](https://huggingface.co/datasets/nguha/legalbench/viewer/consumer_contracts_qa) (Koreeda et al., 2021) | ['eng'] | Retrieval | s2p | [Legal, Written] | None | None | +| [LegalBenchConsumerContractsQA](https://huggingface.co/datasets/nguha/legalbench/viewer/consumer_contracts_qa) (Hendrycks et al., 2021) | ['eng'] | Retrieval | s2p | [Legal, Written] | None | None | | [LegalBenchCorporateLobbying](https://huggingface.co/datasets/nguha/legalbench/viewer/corporate_lobbying) (Neel Guha, 2023) | ['eng'] | Retrieval | s2p | [Legal, Written] | None | None | | 
[LegalBenchPC](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | PairClassification | s2s | [Legal, Written] | None | None | | [LegalQuAD](https://github.com/Christoph911/AIKE2021_Appendix) (Hoppe et al., 2021) | ['deu'] | Retrieval | s2p | [Legal, Written] | None | None | | [LegalReasoningCausalityLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | -| [LegalSummarization](https://github.com/lauramanor/legal_summarization) | ['eng'] | Retrieval | s2p | [Legal, Written] | None | None | +| [LegalSummarization](https://github.com/lauramanor/legal_summarization) (Manor et al., 2019) | ['eng'] | Retrieval | s2p | [Legal, Written] | None | None | +| [LibriCount](https://huggingface.co/datasets/silky1708/LibriCount) (Stoter et al., 2018) | ['eng'] | AudioClassification | a2t | [Speech] | None | None | | [LinceMTBitextMining](https://ritual.uh.edu/lince/) (Aguilar et al., 2020) | ['eng', 'hin'] | BitextMining | s2s | [Social, Written] | None | None | | [LitSearchRetrieval](https://github.com/princeton-nlp/LitSearch) (Ajith et al., 2024) | ['eng'] | Retrieval | s2p | [Academic, Non-fiction, Written] | None | None | | [LivedoorNewsClustering.v2](https://github.com/sbintuitions/JMTEB) | ['jpn'] | Clustering | s2s | [News, Written] | None | None | @@ -434,8 +449,8 @@ The following tables give you an overview of the tasks in MTEB. 
| [MIRACLReranking](https://project-miracl.github.io/) (Zhang et al., 2023) | ['ara', 'ben', 'deu', 'eng', 'fas', 'fin', 'fra', 'hin', 'ind', 'jpn', 'kor', 'rus', 'spa', 'swa', 'tel', 'tha', 'yor', 'zho'] | Reranking | s2s | [Encyclopaedic, Written] | None | None | | [MIRACLRetrieval](http://miracl.ai/) (Zhang et al., 2023) | ['ara', 'ben', 'deu', 'eng', 'fas', 'fin', 'fra', 'hin', 'ind', 'jpn', 'kor', 'rus', 'spa', 'swa', 'tel', 'tha', 'yor', 'zho'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [MIRACLRetrievalHardNegatives](http://miracl.ai/) (Zhang et al., 2023) | ['ara', 'ben', 'deu', 'eng', 'fas', 'fin', 'fra', 'hin', 'ind', 'jpn', 'kor', 'rus', 'spa', 'swa', 'tel', 'tha', 'yor', 'zho'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | -| [MLQARetrieval](https://huggingface.co/datasets/mlqa) | ['ara', 'deu', 'eng', 'hin', 'spa', 'vie', 'zho'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | -| [MLQuestions](https://github.com/McGill-NLP/MLQuestions) | ['eng'] | Retrieval | s2p | [Academic, Encyclopaedic, Written] | None | None | +| [MLQARetrieval](https://huggingface.co/datasets/mlqa) (Lewis et al., 2019) | ['ara', 'deu', 'eng', 'hin', 'spa', 'vie', 'zho'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | +| [MLQuestions](https://github.com/McGill-NLP/MLQuestions) (Kulshreshtha et al., 2021) | ['eng'] | Retrieval | s2p | [Academic, Encyclopaedic, Written] | None | None | | [MLSUMClusteringP2P.v2](https://huggingface.co/datasets/mteb/mlsum) (Scialom et al., 2020) | ['deu', 'fra', 'rus', 'spa'] | Clustering | p2p | [News, Written] | None | None | | [MLSUMClusteringS2S.v2](https://huggingface.co/datasets/mteb/mlsum) (Scialom et al., 2020) | ['deu', 'fra', 'rus', 'spa'] | Clustering | s2s | [News, Written] | None | None | | [MMarcoReranking](https://github.com/unicamp-dl/mMARCO) (Luiz Henrique Bonifacio, 2021) | ['cmn'] | Reranking | s2s | | None | None | @@ -444,40 +459,43 @@ The following tables give you an 
overview of the tasks in MTEB. | [MNISTZeroShot](https://en.wikipedia.org/wiki/MNIST_database) (LeCun et al., 2010) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | {'test': 10000} | {'test': {'num_samples': 10000, 'unique_num_labels': 10, 'min_image_width': 28, 'average_image_width': 28.0, 'max_image_width': 28, 'min_image_height': 28, 'average_image_height': 28.0, 'max_image_height': 28, 'min_label_text_length': 27, 'average_label_text_length': 27.0, 'max_label_text_length': 27, 'labels': {'7': {'count': 1028}, '2': {'count': 1032}, '1': {'count': 1135}, '0': {'count': 980}, '4': {'count': 982}, '9': {'count': 1009}, '5': {'count': 892}, '6': {'count': 958}, '3': {'count': 1010}, '8': {'count': 974}}}} | | [MSCOCOI2TRetrieval](https://link.springer.com/chapter/10.1007/978-3-319-10602-1_48) (Lin et al., 2014) | ['eng'] | Any2AnyRetrieval | i2t | [Encyclopaedic] | {'test': 29809} | {'test': {'number_of_characters': 1303643, 'num_samples': 29809, 'num_queries': 5000, 'num_documents': 24809, 'min_document_length': 27, 'average_document_length': 52.55, 'max_document_length': 243, 'unique_documents': 24809, 'num_document_images': 0, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 5000, 'min_relevant_docs_per_query': 4, 'average_relevant_docs_per_query': 5.0, 'max_relevant_docs_per_query': 5, 'unique_relevant_docs': 24809}} | | [MSCOCOT2IRetrieval](https://link.springer.com/chapter/10.1007/978-3-319-10602-1_48) (Lin et al., 2014) | ['eng'] | Any2AnyRetrieval | t2i | [Encyclopaedic] | {'test': 29809} | {'test': {'number_of_characters': 1303643, 'num_samples': 29809, 'num_queries': 24809, 'num_documents': 5000, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 5000, 'min_query_length': 27, 'average_query_length': 52.55, 'max_query_length': 243, 'unique_queries': 24809, 'num_query_images': 0, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.01, 'max_relevant_docs_per_query': 8, 'unique_relevant_docs': 5000}} | -| [MSMARCO](https://microsoft.github.io/msmarco/) (Tri Nguyen and Mir Rosenberg and Xia Song and Jianfeng Gao and Saurabh Tiwary and Rangan Majumder and Li Deng, 2016) | ['eng'] | Retrieval | s2p | [Academic, Blog, Encyclopaedic, Government, Medical, News, Non-fiction, Reviews, Social, Web] | None | None | +| [MSMARCO](https://microsoft.github.io/msmarco/) (Tri Nguyen and Mir Rosenberg and Xia Song and Jianfeng Gao and Saurabh Tiwary and Rangan Majumder and Li Deng, 2016) | ['eng'] | Retrieval | s2p | [Academic, Blog, Encyclopaedic, Government, Medical, News, Non-fiction, Reviews, Social, Web] | None | None | | [MSMARCO-Fa](https://huggingface.co/datasets/MCINext/msmarco-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [MSMARCO-PL](https://microsoft.github.io/msmarco/) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | [Web, Written] | None | None | | [MSMARCO-PLHardNegatives](https://microsoft.github.io/msmarco/) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | [Web, Written] | None | None | -| [MSMARCOHardNegatives](https://microsoft.github.io/msmarco/) (Tri Nguyen and Mir Rosenberg and Xia Song and Jianfeng Gao and Saurabh Tiwary and Rangan Majumder and Li Deng, 2016) | ['eng'] | Retrieval | s2p | [Academic, Blog, Encyclopaedic, Government, Medical, News, Non-fiction, Reviews, Social, Web] | None | None | -| [MSMARCOv2](https://microsoft.github.io/msmarco/TREC-Deep-Learning.html) (Tri Nguyen and Mir Rosenberg and Xia Song and Jianfeng Gao and Saurabh Tiwary and Rangan Majumder and Li Deng, 2016) | ['eng'] | Retrieval | s2p | [Academic, Blog, Encyclopaedic, Government, Medical, News, Non-fiction, Reviews, Social, Web] | None | None | -| [MTOPDomainClassification](https://arxiv.org/pdf/2008.09335.pdf) | ['deu', 'eng', 'fra', 'hin', 'spa', 'tha'] | Classification | s2s | [Spoken, Spoken] | None | None | -| 
[MTOPIntentClassification](https://arxiv.org/pdf/2008.09335.pdf) | ['deu', 'eng', 'fra', 'hin', 'spa', 'tha'] | Classification | s2s | [Spoken, Spoken] | None | None | -| [MacedonianTweetSentimentClassification](https://aclanthology.org/R15-1034/) | ['mkd'] | Classification | s2s | [Social, Written] | None | None | +| [MSMARCOHardNegatives](https://microsoft.github.io/msmarco/) (Tri Nguyen and Mir Rosenberg and Xia Song and Jianfeng Gao and Saurabh Tiwary and Rangan Majumder and Li Deng, 2016) | ['eng'] | Retrieval | s2p | [Academic, Blog, Encyclopaedic, Government, Medical, News, Non-fiction, Reviews, Social, Web] | None | None | +| [MSMARCOv2](https://microsoft.github.io/msmarco/TREC-Deep-Learning.html) (Tri Nguyen and Mir Rosenberg and Xia Song and Jianfeng Gao and Saurabh Tiwary and Rangan Majumder and Li Deng, 2016) | ['eng'] | Retrieval | s2p | [Academic, Blog, Encyclopaedic, Government, Medical, News, Non-fiction, Reviews, Social, Web] | None | None | +| [MTOPDomainClassification](https://arxiv.org/pdf/2008.09335.pdf) (Li et al., 2021) | ['deu', 'eng', 'fra', 'hin', 'spa', 'tha'] | Classification | s2s | [Spoken, Spoken] | None | None | +| [MTOPIntentClassification](https://arxiv.org/pdf/2008.09335.pdf) (Li et al., 2021) | ['deu', 'eng', 'fra', 'hin', 'spa', 'tha'] | Classification | s2s | [Spoken, Spoken] | None | None | +| [MacedonianTweetSentimentClassification](https://aclanthology.org/R15-1034/) (Jovanoski et al., 2015) | ['mkd'] | Classification | s2s | [Social, Written] | None | None | | [MalayalamNewsClassification](https://github.com/goru001/nlp-for-malyalam) (Anoop Kunchukuttan, 2020) | ['mal'] | Classification | s2s | [News, Written] | None | None | -| [MalteseNewsClassification](https://huggingface.co/datasets/MLRS/maltese_news_categories) | ['mlt'] | MultilabelClassification | s2s | [Constructed, Written] | None | None | +| [MalteseNewsClassification](https://huggingface.co/datasets/MLRS/maltese_news_categories) (Chaudhary et al., 2024) | 
['mlt'] | MultilabelClassification | s2s | [Constructed, Written] | None | None | | [MarathiNewsClassification](https://github.com/goru001/nlp-for-marathi) (Anoop Kunchukuttan, 2020) | ['mar'] | Classification | s2s | [News, Written] | None | None | | [MasakhaNEWSClassification](https://arxiv.org/abs/2304.09972) (David Ifeoluwa Adelani, 2023) | ['amh', 'eng', 'fra', 'hau', 'ibo', 'lin', 'lug', 'orm', 'pcm', 'run', 'sna', 'som', 'swa', 'tir', 'xho', 'yor'] | Classification | s2s | [News, Written] | None | None | | [MasakhaNEWSClusteringP2P](https://huggingface.co/datasets/masakhane/masakhanews) (David Ifeoluwa Adelani, 2023) | ['amh', 'eng', 'fra', 'hau', 'ibo', 'lin', 'lug', 'orm', 'pcm', 'run', 'sna', 'som', 'swa', 'tir', 'xho', 'yor'] | Clustering | p2p | [News, Non-fiction, Written] | None | None | -| [MasakhaNEWSClusteringS2S](https://huggingface.co/datasets/masakhane/masakhanews) (David Ifeoluwa Adelani, 2023) | ['amh', 'eng', 'fra', 'hau', 'ibo', 'lin', 'lug', 'orm', 'pcm', 'run', 'sna', 'som', 'swa', 'tir', 'xho', 'yor'] | Clustering | s2s | | None | None | +| [MasakhaNEWSClusteringS2S](https://huggingface.co/datasets/masakhane/masakhanews) (David Ifeoluwa Adelani, 2023) | ['amh', 'eng', 'fra', 'hau', 'ibo', 'lin', 'lug', 'orm', 'pcm', 'run', 'sna', 'som', 'swa', 'tir', 'xho', 'yor'] | Clustering | s2s | [News, Written] | None | None | | [MassiveIntentClassification](https://arxiv.org/abs/2204.08582) (Jack FitzGerald, 2022) | ['afr', 'amh', 'ara', 'aze', 'ben', 'cmo', 'cym', 'dan', 'deu', 'ell', 'eng', 'fas', 'fin', 'fra', 'heb', 'hin', 'hun', 'hye', 'ind', 'isl', 'ita', 'jav', 'jpn', 'kan', 'kat', 'khm', 'kor', 'lav', 'mal', 'mon', 'msa', 'mya', 'nld', 'nob', 'pol', 'por', 'ron', 'rus', 'slv', 'spa', 'sqi', 'swa', 'swe', 'tam', 'tel', 'tgl', 'tha', 'tur', 'urd', 'vie'] | Classification | s2s | [Spoken] | None | None | | [MassiveScenarioClassification](https://arxiv.org/abs/2204.08582) (Jack FitzGerald, 2022) | ['afr', 'amh', 'ara', 'aze', 'ben', 'cmo', 
'cym', 'dan', 'deu', 'ell', 'eng', 'fas', 'fin', 'fra', 'heb', 'hin', 'hun', 'hye', 'ind', 'isl', 'ita', 'jav', 'jpn', 'kan', 'kat', 'khm', 'kor', 'lav', 'mal', 'mon', 'msa', 'mya', 'nld', 'nob', 'pol', 'por', 'ron', 'rus', 'slv', 'spa', 'sqi', 'swa', 'swe', 'tam', 'tel', 'tgl', 'tha', 'tur', 'urd', 'vie'] | Classification | s2s | [Spoken] | None | None | | [MedicalQARetrieval](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-019-3119-4) (Asma et al., 2019) | ['eng'] | Retrieval | s2s | [Medical, Written] | None | None | -| [MedicalRetrieval](https://arxiv.org/abs/2203.03367) | ['cmn'] | Retrieval | s2p | | None | None | +| [MedicalRetrieval](https://arxiv.org/abs/2203.03367) (Dingkun Long, 2022) | ['cmn'] | Retrieval | s2p | | None | None | | [MedrxivClusteringP2P.v2](https://api.medrxiv.org/) | ['eng'] | Clustering | p2p | [Academic, Medical, Written] | {'test': 37500} | {'test': {'num_samples': 37500, 'number_of_characters': 74294927, 'min_text_length': 148, 'average_text_length': 1981.2, 'max_text_length': 38759, 'min_labels_per_text': 6, 'average_labels_per_text': 1.0, 'max_labels_per_text': 8830, 'unique_labels': 51, 'labels': {'epidemiology': {'count': 6656}, 'public and global health': {'count': 3595}, 'oncology': {'count': 845}, 'allergy and immunology': {'count': 464}, 'orthopedics': {'count': 104}, 'health informatics': {'count': 1107}, 'occupational and environmental health': {'count': 415}, 'infectious diseases': {'count': 8830}, 'genetic and genomic medicine': {'count': 1918}, 'health policy': {'count': 527}, 'gastroenterology': {'count': 343}, 'radiology and imaging': {'count': 541}, 'pain medicine': {'count': 121}, 'neurology': {'count': 1773}, 'primary care research': {'count': 232}, 'rheumatology': {'count': 189}, 'endocrinology': {'count': 419}, 'hematology': {'count': 202}, 'addiction medicine': {'count': 178}, 'pediatrics': {'count': 589}, 'cardiovascular medicine': {'count': 855}, 'obstetrics and gynecology': {'count': 373}, 
'health systems and quality improvement': {'count': 491}, 'nephrology': {'count': 241}, 'respiratory medicine': {'count': 482}, 'geriatric medicine': {'count': 169}, 'dentistry and oral medicine': {'count': 159}, 'psychiatry and clinical psychology': {'count': 1781}, 'nutrition': {'count': 240}, 'intensive care and critical care medicine': {'count': 368}, 'rehabilitation medicine and physical therapy': {'count': 322}, 'otolaryngology': {'count': 166}, 'nursing': {'count': 93}, 'transplantation': {'count': 118}, 'health economics': {'count': 327}, 'sports medicine': {'count': 180}, 'hiv aids': {'count': 363}, 'dermatology': {'count': 98}, 'pathology': {'count': 223}, 'emergency medicine': {'count': 191}, 'pharmacology and therapeutics': {'count': 221}, 'ophthalmology': {'count': 220}, 'medical ethics': {'count': 46}, 'palliative medicine': {'count': 45}, 'sexual and reproductive health': {'count': 156}, 'medical education': {'count': 203}, 'surgery': {'count': 162}, 'urology': {'count': 65}, 'anesthesia': {'count': 72}, 'toxicology': {'count': 16}, 'forensic medicine': {'count': 6}}}} | | [MedrxivClusteringS2S.v2](https://api.medrxiv.org/) | ['eng'] | Clustering | s2s | [Academic, Medical, Written] | {'test': 37500} | {'test': {'num_samples': 37500, 'number_of_characters': 4301276, 'min_text_length': 18, 'average_text_length': 114.7, 'max_text_length': 339, 'min_labels_per_text': 6, 'average_labels_per_text': 1.0, 'max_labels_per_text': 8830, 'unique_labels': 51, 'labels': {'epidemiology': {'count': 6656}, 'public and global health': {'count': 3595}, 'oncology': {'count': 845}, 'allergy and immunology': {'count': 464}, 'orthopedics': {'count': 104}, 'health informatics': {'count': 1107}, 'occupational and environmental health': {'count': 415}, 'infectious diseases': {'count': 8830}, 'genetic and genomic medicine': {'count': 1918}, 'health policy': {'count': 527}, 'gastroenterology': {'count': 343}, 'radiology and imaging': {'count': 541}, 'pain medicine': {'count': 
121}, 'neurology': {'count': 1773}, 'primary care research': {'count': 232}, 'rheumatology': {'count': 189}, 'endocrinology': {'count': 419}, 'hematology': {'count': 202}, 'addiction medicine': {'count': 178}, 'pediatrics': {'count': 589}, 'cardiovascular medicine': {'count': 855}, 'obstetrics and gynecology': {'count': 373}, 'health systems and quality improvement': {'count': 491}, 'nephrology': {'count': 241}, 'respiratory medicine': {'count': 482}, 'geriatric medicine': {'count': 169}, 'dentistry and oral medicine': {'count': 159}, 'psychiatry and clinical psychology': {'count': 1781}, 'nutrition': {'count': 240}, 'intensive care and critical care medicine': {'count': 368}, 'rehabilitation medicine and physical therapy': {'count': 322}, 'otolaryngology': {'count': 166}, 'nursing': {'count': 93}, 'transplantation': {'count': 118}, 'health economics': {'count': 327}, 'sports medicine': {'count': 180}, 'hiv aids': {'count': 363}, 'dermatology': {'count': 98}, 'pathology': {'count': 223}, 'emergency medicine': {'count': 191}, 'pharmacology and therapeutics': {'count': 221}, 'ophthalmology': {'count': 220}, 'medical ethics': {'count': 46}, 'palliative medicine': {'count': 45}, 'sexual and reproductive health': {'count': 156}, 'medical education': {'count': 203}, 'surgery': {'count': 162}, 'urology': {'count': 65}, 'anesthesia': {'count': 72}, 'toxicology': {'count': 16}, 'forensic medicine': {'count': 6}}}} | | [MemotionI2TRetrieval](https://aclanthology.org/2020.semeval-1.99/) (Sharma et al., 2020) | ['eng'] | Any2AnyRetrieval | i2t | [Encyclopaedic] | {'test': 7685} | {'test': {'number_of_characters': 578340, 'num_samples': 7685, 'num_queries': 697, 'num_documents': 6988, 'min_document_length': 0, 'average_document_length': 82.76, 'max_document_length': 1026, 'unique_documents': 6939, 'num_document_images': 0, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 697, 'min_relevant_docs_per_query': 1, 
'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 697}} | | [MemotionT2IRetrieval](https://aclanthology.org/2020.semeval-1.99/) (Sharma et al., 2020) | ['eng'] | Any2AnyRetrieval | t2i | [Encyclopaedic] | {'test': 7685} | {'test': {'number_of_characters': 58409, 'num_samples': 7685, 'num_queries': 697, 'num_documents': 6988, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 6988, 'min_query_length': 4, 'average_query_length': 83.8, 'max_query_length': 504, 'unique_queries': 697, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 697}} | -| [MewsC16JaClustering](https://github.com/sbintuitions/JMTEB) | ['jpn'] | Clustering | s2s | [News, Written] | None | None | -| [MindSmallReranking](https://msnews.github.io/assets/doc/ACL2020_MIND.pdf) | ['eng'] | Reranking | s2s | [News, Written] | None | None | -| MintakaRetrieval | ['ara', 'deu', 'fra', 'hin', 'ita', 'jpn', 'por', 'spa'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | +| [MewsC16JaClustering](https://github.com/sbintuitions/JMTEB) (Nishikawa et al., 2022) | ['jpn'] | Clustering | s2s | [News, Written] | None | None | +| [MindSmallReranking](https://msnews.github.io/assets/doc/ACL2020_MIND.pdf) (Wu et al., 2020) | ['eng'] | Reranking | s2s | [News, Written] | None | None | +| MintakaRetrieval (Sen et al., 2022) | ['ara', 'deu', 'fra', 'hin', 'ita', 'jpn', 'por', 'spa'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [Moroco](https://huggingface.co/datasets/moroco) (Andrei M. 
Butnaru, 2019) | ['ron'] | Classification | s2s | [News, Written] | None | None | | [MovieReviewSentimentClassification](https://github.com/TheophileBlard/french-sentiment-analysis-with-bert) (Théophile Blard, 2020) | ['fra'] | Classification | s2s | [Reviews, Written] | None | None | | [MrTidyRetrieval](https://huggingface.co/datasets/castorini/mr-tydi) (Xinyu Zhang, 2021) | ['ara', 'ben', 'eng', 'fin', 'ind', 'jpn', 'kor', 'rus', 'swa', 'tel', 'tha'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | +| [MridinghamStroke](https://huggingface.co/datasets/silky1708/Mridingham-Stroke) (Anantapadmanabhan et al., 2013) | ['eng'] | AudioClassification | a2t | [Music] | None | None | +| [MridinghamTonic](https://huggingface.co/datasets/silky1708/Mridingham-Tonic) (Anantapadmanabhan et al., 2013) | ['eng'] | AudioClassification | a2t | [Music] | None | None | | [MultiEURLEXMultilabelClassification](https://huggingface.co/datasets/coastalcph/multi_eurlex) (Chalkidis et al., 2021) | ['bul', 'ces', 'dan', 'deu', 'ell', 'eng', 'est', 'fin', 'fra', 'hrv', 'hun', 'ita', 'lav', 'lit', 'mlt', 'nld', 'pol', 'por', 'ron', 'slk', 'slv', 'spa', 'swe'] | MultilabelClassification | p2p | [Government, Legal, Written] | None | None | -| [MultiHateClassification](https://aclanthology.org/2022.woah-1.15/) | ['ara', 'cmn', 'deu', 'eng', 'fra', 'hin', 'ita', 'nld', 'pol', 'por', 'spa'] | Classification | s2s | [Constructed, Written] | None | None | +| [MultiHateClassification](https://aclanthology.org/2022.woah-1.15/) (Röttger et al., 2021) | ['ara', 'cmn', 'deu', 'eng', 'fra', 'hin', 'ita', 'nld', 'pol', 'por', 'spa'] | Classification | s2s | [Constructed, Written] | None | None | | [MultiLongDocRetrieval](https://arxiv.org/abs/2402.03216) (Jianlv Chen, 2024) | ['ara', 'cmn', 'deu', 'eng', 'fra', 'hin', 'ita', 'jpn', 'kor', 'por', 'rus', 'spa', 'tha'] | Retrieval | s2p | [Encyclopaedic, Fiction, Non-fiction, Web, Written] | None | None | | 
[MultilingualSentiment](https://github.com/tyqiangz/multilingual-sentiment-datasets) | ['cmn'] | Classification | s2s | | None | None | -| [MultilingualSentimentClassification](https://huggingface.co/datasets/mteb/multilingual-sentiment-classification) | ['ara', 'bam', 'bul', 'cmn', 'cym', 'deu', 'dza', 'ell', 'eng', 'eus', 'fas', 'fin', 'heb', 'hrv', 'ind', 'jpn', 'kor', 'mlt', 'nor', 'pol', 'rus', 'slk', 'spa', 'tha', 'tur', 'uig', 'urd', 'vie', 'zho'] | Classification | s2s | [Reviews, Written] | None | None | +| [MultilingualSentimentClassification](https://huggingface.co/datasets/mteb/multilingual-sentiment-classification) (Mollanorozy et al., 2023) | ['ara', 'bam', 'bul', 'cmn', 'cym', 'deu', 'dza', 'ell', 'eng', 'eus', 'fas', 'fin', 'heb', 'hrv', 'ind', 'jpn', 'kor', 'mlt', 'nor', 'pol', 'rus', 'slk', 'spa', 'tha', 'tur', 'uig', 'urd', 'vie', 'zho'] | Classification | s2s | [Reviews, Written] | None | None | +| [MusicGenreClustering](https://www-ai.cs.tu-dortmund.de/audio.html) (Homburg et al., 2005) | ['eng'] | AudioClustering | a2a | [Music] | None | None | | [MyanmarNews](https://huggingface.co/datasets/myanmar_news) (A. H. Khine, 2017) | ['mya'] | Classification | p2p | [News, Written] | None | None | | [NFCorpus](https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/) (Boteva et al., 2016) | ['eng'] | Retrieval | s2p | [Academic, Medical, Written] | {'test': 3956} | {'test': {'number_of_characters': 1612.55, 'num_samples': 3956, 'num_queries': 323, 'num_documents': 3633, 'average_document_length': 0.44, 'average_query_length': 0.07, 'average_relevant_docs_per_query': 38.19}} | | [NFCorpus-Fa](https://huggingface.co/datasets/MCINext/nfcorpus-fa) | ['fas'] | Retrieval | s2p | [Medical] | None | None | @@ -495,39 +513,40 @@ The following tables give you an overview of the tasks in MTEB. 
| [NQ-PL](https://ai.google.com/research/NaturalQuestions/) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | | None | None | | [NQ-PLHardNegatives](https://ai.google.com/research/NaturalQuestions/) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | | None | None | | [NQHardNegatives](https://ai.google.com/research/NaturalQuestions/) (Tom Kwiatkowski, 2019) | ['eng'] | Retrieval | s2p | | None | None | -| [NTREXBitextMining](https://huggingface.co/datasets/davidstap/NTREX) | ['afr', 'amh', 'arb', 'aze', 'bak', 'bel', 'bem', 'ben', 'bod', 'bos', 'bul', 'cat', 'ces', 'ckb', 'cym', 'dan', 'deu', 'div', 'dzo', 'ell', 'eng', 'eus', 'ewe', 'fao', 'fas', 'fij', 'fil', 'fin', 'fra', 'fuc', 'gle', 'glg', 'guj', 'hau', 'heb', 'hin', 'hmn', 'hrv', 'hun', 'hye', 'ibo', 'ind', 'isl', 'ita', 'jpn', 'kan', 'kat', 'kaz', 'khm', 'kin', 'kir', 'kmr', 'kor', 'lao', 'lav', 'lit', 'ltz', 'mal', 'mar', 'mey', 'mkd', 'mlg', 'mlt', 'mon', 'mri', 'msa', 'mya', 'nde', 'nep', 'nld', 'nno', 'nob', 'nso', 'nya', 'orm', 'pan', 'pol', 'por', 'prs', 'pus', 'ron', 'rus', 'shi', 'sin', 'slk', 'slv', 'smo', 'sna', 'snd', 'som', 'spa', 'sqi', 'srp', 'ssw', 'swa', 'swe', 'tah', 'tam', 'tat', 'tel', 'tgk', 'tha', 'tir', 'ton', 'tsn', 'tuk', 'tur', 'uig', 'ukr', 'urd', 'uzb', 'ven', 'vie', 'wol', 'xho', 'yor', 'yue', 'zho', 'zul'] | BitextMining | s2s | [News, Written] | {'test': 3826252} | {'test': {'num_samples': 3826252, 'number_of_characters': 988355274, 'unique_pairs': 3820263, 'min_sentence1_length': 1, 'average_sentence1_length': 129.15, 'max_sentence1_length': 773, 'unique_sentence1': 241259, 'min_sentence2_length': 1, 'average_sentence2_length': 129.15, 'max_sentence2_length': 773, 'unique_sentence2': 241259, 'hf_subset_descriptive_stats': {'afr_Latn-dan_Latn': {'num_samples': 1997, 'number_of_characters': 520490, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 134.38, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 
'average_sentence2_length': 126.26, 'max_sentence2_length': 522, 'unique_sentence2': 1995}, 'afr_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 564002, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 134.38, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'afr_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 516072, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 134.38, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'afr_Latn-fao_Latn': {'num_samples': 1997, 'number_of_characters': 526155, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 134.38, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.1, 'max_sentence2_length': 433, 'unique_sentence2': 1997}, 'afr_Latn-isl_Latn': {'num_samples': 1997, 'number_of_characters': 530560, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 134.38, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 131.3, 'max_sentence2_length': 399, 'unique_sentence2': 1996}, 'afr_Latn-ltz_Latn': {'num_samples': 1997, 'number_of_characters': 549109, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 134.38, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 140.59, 'max_sentence2_length': 543, 'unique_sentence2': 1996}, 'afr_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 560267, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 134.38, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 
8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'afr_Latn-nno_Latn': {'num_samples': 1997, 'number_of_characters': 516709, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 134.38, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.37, 'max_sentence2_length': 417, 'unique_sentence2': 1996}, 'afr_Latn-nob_Latn': {'num_samples': 1997, 'number_of_characters': 519796, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 134.38, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 125.91, 'max_sentence2_length': 482, 'unique_sentence2': 1996}, 'afr_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 520179, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 134.38, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'amh_Ethi-eng_Latn': {'num_samples': 1997, 'number_of_characters': 415227, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'amh_Ethi-hau_Latn': {'num_samples': 1997, 'number_of_characters': 437473, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'amh_Ethi-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 413608, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 
6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 'unique_sentence2': 1997}, 'amh_Ethi-nso_Latn': {'num_samples': 1997, 'number_of_characters': 459006, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 1996}, 'amh_Ethi-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 404938, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'amh_Ethi-som_Latn': {'num_samples': 1997, 'number_of_characters': 458799, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'amh_Ethi-ssw_Latn': {'num_samples': 1997, 'number_of_characters': 455649, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'amh_Ethi-swa_Latn': {'num_samples': 1997, 'number_of_characters': 440016, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'amh_Ethi-tir_Ethi': {'num_samples': 1997, 'number_of_characters': 332745, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 
5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 272, 'unique_sentence2': 1996}, 'amh_Ethi-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 501790, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 'unique_sentence2': 1997}, 'amh_Ethi-wol_Latn': {'num_samples': 1997, 'number_of_characters': 407310, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 'amh_Ethi-xho_Latn': {'num_samples': 1997, 'number_of_characters': 435597, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'amh_Ethi-yor_Latn': {'num_samples': 1997, 'number_of_characters': 483595, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'amh_Ethi-zul_Latn': {'num_samples': 1997, 'number_of_characters': 425239, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'arb_Arab-ben_Beng': {'num_samples': 1997, 'number_of_characters': 474983, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 
6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'arb_Arab-ckb_Arab': {'num_samples': 1997, 'number_of_characters': 483548, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 126.37, 'max_sentence2_length': 399, 'unique_sentence2': 1995}, 'arb_Arab-deu_Latn': {'num_samples': 1997, 'number_of_characters': 526831, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'arb_Arab-ell_Grek': {'num_samples': 1997, 'number_of_characters': 530308, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'arb_Arab-eng_Latn': {'num_samples': 1997, 'number_of_characters': 478901, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'arb_Arab-fas_Arab': {'num_samples': 1997, 'number_of_characters': 474520, 'unique_pairs': 1995, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'arb_Arab-fin_Latn': {'num_samples': 1997, 'number_of_characters': 500981, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 
'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'arb_Arab-fra_Latn': {'num_samples': 1997, 'number_of_characters': 524289, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'arb_Arab-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 431477, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'arb_Arab-hin_Deva': {'num_samples': 1997, 'number_of_characters': 492756, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'arb_Arab-hun_Latn': {'num_samples': 1997, 'number_of_characters': 509557, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'arb_Arab-ind_Latn': {'num_samples': 1997, 'number_of_characters': 518153, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'arb_Arab-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 342807, 'unique_pairs': 1995, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 
'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'arb_Arab-kmr_Latn': {'num_samples': 1997, 'number_of_characters': 477127, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 123.16, 'max_sentence2_length': 420, 'unique_sentence2': 1996}, 'arb_Arab-kor_Hang': {'num_samples': 1997, 'number_of_characters': 364586, 'unique_pairs': 1995, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'arb_Arab-lit_Latn': {'num_samples': 1997, 'number_of_characters': 490578, 'unique_pairs': 1995, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'arb_Arab-mey_Arab': {'num_samples': 1997, 'number_of_characters': 445016, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 107.08, 'max_sentence2_length': 392, 'unique_sentence2': 1993}, 'arb_Arab-nld_Latn': {'num_samples': 1997, 'number_of_characters': 523096, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'arb_Arab-pol_Latn': {'num_samples': 1997, 'number_of_characters': 509047, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 
'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'arb_Arab-por_Latn': {'num_samples': 1997, 'number_of_characters': 508396, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'arb_Arab-prs_Arab': {'num_samples': 1997, 'number_of_characters': 473717, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 121.45, 'max_sentence2_length': 365, 'unique_sentence2': 1997}, 'arb_Arab-pus_Arab': {'num_samples': 1997, 'number_of_characters': 473814, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 121.5, 'max_sentence2_length': 366, 'unique_sentence2': 1996}, 'arb_Arab-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 506074, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'arb_Arab-shi_Arab': {'num_samples': 1997, 'number_of_characters': 446094, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 3, 'average_sentence2_length': 107.62, 'max_sentence2_length': 378, 'unique_sentence2': 1996}, 'arb_Arab-spa_Latn': {'num_samples': 1997, 'number_of_characters': 519381, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 
1995, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'arb_Arab-swa_Latn': {'num_samples': 1997, 'number_of_characters': 503690, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'arb_Arab-swe_Latn': {'num_samples': 1997, 'number_of_characters': 483008, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'arb_Arab-tam_Taml': {'num_samples': 1997, 'number_of_characters': 541142, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'arb_Arab-tgk_Cyrl': {'num_samples': 1997, 'number_of_characters': 505328, 'unique_pairs': 1995, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 3, 'average_sentence2_length': 137.28, 'max_sentence2_length': 451, 'unique_sentence2': 1995}, 'arb_Arab-tur_Latn': {'num_samples': 1997, 'number_of_characters': 496794, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'arb_Arab-vie_Latn': {'num_samples': 1997, 'number_of_characters': 502302, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 
'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'arb_Arab-zho_Hant': {'num_samples': 1997, 'number_of_characters': 322659, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'arb_Arab-zul_Latn': {'num_samples': 1997, 'number_of_characters': 488913, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'aze_Latn-bak_Cyrl': {'num_samples': 1997, 'number_of_characters': 515960, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 135.02, 'max_sentence1_length': 398, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 123.35, 'max_sentence2_length': 437, 'unique_sentence2': 1995}, 'aze_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 517354, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 135.02, 'max_sentence1_length': 398, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'aze_Latn-kaz_Cyrl': {'num_samples': 1997, 'number_of_characters': 529910, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 135.02, 'max_sentence1_length': 398, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 130.33, 'max_sentence2_length': 473, 'unique_sentence2': 1996}, 'aze_Latn-kir_Cyrl': {'num_samples': 1997, 'number_of_characters': 520498, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 135.02, 'max_sentence1_length': 
398, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 125.62, 'max_sentence2_length': 395, 'unique_sentence2': 1996}, 'aze_Latn-tat_Cyrl': {'num_samples': 1997, 'number_of_characters': 515560, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 135.02, 'max_sentence1_length': 398, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 123.15, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'aze_Latn-tuk_Latn': {'num_samples': 1997, 'number_of_characters': 554908, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 135.02, 'max_sentence1_length': 398, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 142.85, 'max_sentence2_length': 576, 'unique_sentence2': 1996}, 'aze_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 535247, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 135.02, 'max_sentence1_length': 398, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'aze_Latn-uig_Arab': {'num_samples': 1997, 'number_of_characters': 580656, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 135.02, 'max_sentence1_length': 398, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 155.74, 'max_sentence2_length': 592, 'unique_sentence2': 1996}, 'aze_Latn-uzb_Latn': {'num_samples': 1997, 'number_of_characters': 563329, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 135.02, 'max_sentence1_length': 398, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 147.07, 'max_sentence2_length': 470, 'unique_sentence2': 1996}, 'bak_Cyrl-aze_Latn': {'num_samples': 1997, 'number_of_characters': 515960, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 123.35, 
'max_sentence1_length': 437, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 135.02, 'max_sentence2_length': 398, 'unique_sentence2': 1997}, 'bak_Cyrl-eng_Latn': {'num_samples': 1997, 'number_of_characters': 494046, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 123.35, 'max_sentence1_length': 437, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'bak_Cyrl-kaz_Cyrl': {'num_samples': 1997, 'number_of_characters': 506602, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 123.35, 'max_sentence1_length': 437, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 130.33, 'max_sentence2_length': 473, 'unique_sentence2': 1996}, 'bak_Cyrl-kir_Cyrl': {'num_samples': 1997, 'number_of_characters': 497190, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 123.35, 'max_sentence1_length': 437, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 125.62, 'max_sentence2_length': 395, 'unique_sentence2': 1996}, 'bak_Cyrl-tat_Cyrl': {'num_samples': 1997, 'number_of_characters': 492252, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 123.35, 'max_sentence1_length': 437, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 123.15, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'bak_Cyrl-tuk_Latn': {'num_samples': 1997, 'number_of_characters': 531600, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 123.35, 'max_sentence1_length': 437, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 142.85, 'max_sentence2_length': 576, 'unique_sentence2': 1996}, 'bak_Cyrl-tur_Latn': {'num_samples': 1997, 'number_of_characters': 511939, 'unique_pairs': 1997, 'min_sentence1_length': 11, 
'average_sentence1_length': 123.35, 'max_sentence1_length': 437, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'bak_Cyrl-uig_Arab': {'num_samples': 1997, 'number_of_characters': 557348, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 123.35, 'max_sentence1_length': 437, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 155.74, 'max_sentence2_length': 592, 'unique_sentence2': 1996}, 'bak_Cyrl-uzb_Latn': {'num_samples': 1997, 'number_of_characters': 540021, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 123.35, 'max_sentence1_length': 437, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 147.07, 'max_sentence2_length': 470, 'unique_sentence2': 1996}, 'bel_Cyrl-bos_Latn': {'num_samples': 1997, 'number_of_characters': 511000, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 128.24, 'max_sentence1_length': 422, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 127.65, 'max_sentence2_length': 434, 'unique_sentence2': 1996}, 'bel_Cyrl-bul_Cyrl': {'num_samples': 1997, 'number_of_characters': 525979, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 128.24, 'max_sentence1_length': 422, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 135.15, 'max_sentence2_length': 493, 'unique_sentence2': 1996}, 'bel_Cyrl-ces_Latn': {'num_samples': 1997, 'number_of_characters': 497408, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.24, 'max_sentence1_length': 422, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 120.84, 'max_sentence2_length': 474, 'unique_sentence2': 1997}, 'bel_Cyrl-eng_Latn': {'num_samples': 1997, 'number_of_characters': 503810, 'unique_pairs': 1997, 
'min_sentence1_length': 7, 'average_sentence1_length': 128.24, 'max_sentence1_length': 422, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'bel_Cyrl-hrv_Latn': {'num_samples': 1997, 'number_of_characters': 512015, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.24, 'max_sentence1_length': 422, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.15, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'bel_Cyrl-mkd_Cyrl': {'num_samples': 1997, 'number_of_characters': 523981, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.24, 'max_sentence1_length': 422, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.15, 'max_sentence2_length': 451, 'unique_sentence2': 1997}, 'bel_Cyrl-pol_Latn': {'num_samples': 1997, 'number_of_characters': 533956, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 128.24, 'max_sentence1_length': 422, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'bel_Cyrl-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 530983, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 128.24, 'max_sentence1_length': 422, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'bel_Cyrl-slk_Latn': {'num_samples': 1997, 'number_of_characters': 509059, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 128.24, 'max_sentence1_length': 422, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 126.67, 'max_sentence2_length': 403, 'unique_sentence2': 1996}, 'bel_Cyrl-slv_Latn': {'num_samples': 1997, 'number_of_characters': 508986, 'unique_pairs': 
1996, 'min_sentence1_length': 7, 'average_sentence1_length': 128.24, 'max_sentence1_length': 422, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.64, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'bel_Cyrl-srp_Cyrl': {'num_samples': 1997, 'number_of_characters': 508393, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 128.24, 'max_sentence1_length': 422, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.34, 'max_sentence2_length': 439, 'unique_sentence2': 1995}, 'bel_Cyrl-srp_Latn': {'num_samples': 1997, 'number_of_characters': 512231, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 128.24, 'max_sentence1_length': 422, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 128.26, 'max_sentence2_length': 452, 'unique_sentence2': 1996}, 'bel_Cyrl-ukr_Cyrl': {'num_samples': 1997, 'number_of_characters': 518873, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 128.24, 'max_sentence1_length': 422, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 131.59, 'max_sentence2_length': 440, 'unique_sentence2': 1996}, 'bem_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 546212, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 149.47, 'max_sentence1_length': 465, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'bem_Latn-ewe_Latn': {'num_samples': 1997, 'number_of_characters': 537470, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 149.47, 'max_sentence1_length': 465, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 119.67, 'max_sentence2_length': 493, 'unique_sentence2': 1994}, 'bem_Latn-fuc_Latn': {'num_samples': 1997, 'number_of_characters': 526972, 
'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 149.47, 'max_sentence1_length': 465, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 114.41, 'max_sentence2_length': 376, 'unique_sentence2': 1996}, 'bem_Latn-kin_Latn': {'num_samples': 1997, 'number_of_characters': 602279, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 149.47, 'max_sentence1_length': 465, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 152.12, 'max_sentence2_length': 541, 'unique_sentence2': 1996}, 'bem_Latn-nde_Latn': {'num_samples': 1997, 'number_of_characters': 596231, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 149.47, 'max_sentence1_length': 465, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 149.09, 'max_sentence2_length': 590, 'unique_sentence2': 1997}, 'bem_Latn-nya_Latn': {'num_samples': 1997, 'number_of_characters': 582774, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 149.47, 'max_sentence1_length': 465, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 142.35, 'max_sentence2_length': 464, 'unique_sentence2': 1993}, 'bem_Latn-sna_Latn': {'num_samples': 1997, 'number_of_characters': 596822, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 149.47, 'max_sentence1_length': 465, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 149.39, 'max_sentence2_length': 511, 'unique_sentence2': 1995}, 'bem_Latn-ven_Latn': {'num_samples': 1997, 'number_of_characters': 598248, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 149.47, 'max_sentence1_length': 465, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 150.1, 'max_sentence2_length': 535, 'unique_sentence2': 1993}, 'ben_Beng-arb_Arab': {'num_samples': 1997, 'number_of_characters': 
474983, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'ben_Beng-deu_Latn': {'num_samples': 1997, 'number_of_characters': 539452, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'ben_Beng-div_Thaa': {'num_samples': 1997, 'number_of_characters': 547650, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'ben_Beng-ell_Grek': {'num_samples': 1997, 'number_of_characters': 542929, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'ben_Beng-eng_Latn': {'num_samples': 1997, 'number_of_characters': 491522, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ben_Beng-eus_Latn': {'num_samples': 1997, 'number_of_characters': 519005, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'ben_Beng-fas_Arab': {'num_samples': 1997, 
'number_of_characters': 487141, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'ben_Beng-fin_Latn': {'num_samples': 1997, 'number_of_characters': 513602, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'ben_Beng-fra_Latn': {'num_samples': 1997, 'number_of_characters': 536910, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'ben_Beng-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 488733, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'ben_Beng-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 444098, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'ben_Beng-hin_Deva': {'num_samples': 1997, 'number_of_characters': 505377, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'ben_Beng-hun_Latn': {'num_samples': 
1997, 'number_of_characters': 522178, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'ben_Beng-ind_Latn': {'num_samples': 1997, 'number_of_characters': 530774, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'ben_Beng-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 355428, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'ben_Beng-kan_Knda': {'num_samples': 1997, 'number_of_characters': 509338, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'ben_Beng-kor_Hang': {'num_samples': 1997, 'number_of_characters': 377207, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'ben_Beng-lit_Latn': {'num_samples': 1997, 'number_of_characters': 503199, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'ben_Beng-mar_Deva': {'num_samples': 
1997, 'number_of_characters': 504689, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'ben_Beng-nep_Deva': {'num_samples': 1997, 'number_of_characters': 492025, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'ben_Beng-nld_Latn': {'num_samples': 1997, 'number_of_characters': 535717, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'ben_Beng-pan_Guru': {'num_samples': 1997, 'number_of_characters': 494224, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'ben_Beng-pol_Latn': {'num_samples': 1997, 'number_of_characters': 521668, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'ben_Beng-por_Latn': {'num_samples': 1997, 'number_of_characters': 521017, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'ben_Beng-rus_Cyrl': 
{'num_samples': 1997, 'number_of_characters': 518695, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'ben_Beng-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 502543, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'ben_Beng-snd_Arab': {'num_samples': 1997, 'number_of_characters': 464129, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'ben_Beng-spa_Latn': {'num_samples': 1997, 'number_of_characters': 532002, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'ben_Beng-swa_Latn': {'num_samples': 1997, 'number_of_characters': 516311, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'ben_Beng-swe_Latn': {'num_samples': 1997, 'number_of_characters': 495629, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 
'ben_Beng-tam_Taml': {'num_samples': 1997, 'number_of_characters': 553763, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'ben_Beng-tel_Telu': {'num_samples': 1997, 'number_of_characters': 491329, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'ben_Beng-tur_Latn': {'num_samples': 1997, 'number_of_characters': 509415, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'ben_Beng-urd_Arab': {'num_samples': 1997, 'number_of_characters': 491800, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'ben_Beng-vie_Latn': {'num_samples': 1997, 'number_of_characters': 514923, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'ben_Beng-zho_Hant': {'num_samples': 1997, 'number_of_characters': 335280, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 
1996}, 'ben_Beng-zul_Latn': {'num_samples': 1997, 'number_of_characters': 501534, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'bod_Tibt-dzo_Tibt': {'num_samples': 1997, 'number_of_characters': 543850, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 150.54, 'max_sentence1_length': 478, 'unique_sentence1': 1993, 'min_sentence2_length': 5, 'average_sentence2_length': 121.79, 'max_sentence2_length': 411, 'unique_sentence2': 1992}, 'bod_Tibt-eng_Latn': {'num_samples': 1997, 'number_of_characters': 548349, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 150.54, 'max_sentence1_length': 478, 'unique_sentence1': 1993, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'bod_Tibt-khm_Khmr': {'num_samples': 1997, 'number_of_characters': 589120, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 150.54, 'max_sentence1_length': 478, 'unique_sentence1': 1993, 'min_sentence2_length': 6, 'average_sentence2_length': 144.46, 'max_sentence2_length': 517, 'unique_sentence2': 1996}, 'bod_Tibt-lao_Laoo': {'num_samples': 1997, 'number_of_characters': 567609, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 150.54, 'max_sentence1_length': 478, 'unique_sentence1': 1993, 'min_sentence2_length': 7, 'average_sentence2_length': 133.69, 'max_sentence2_length': 507, 'unique_sentence2': 1997}, 'bod_Tibt-mon_Mong': {'num_samples': 1997, 'number_of_characters': 559677, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 150.54, 'max_sentence1_length': 478, 'unique_sentence1': 1993, 'min_sentence2_length': 11, 'average_sentence2_length': 129.72, 'max_sentence2_length': 414, 
'unique_sentence2': 1997}, 'bod_Tibt-mya_Mymr': {'num_samples': 1997, 'number_of_characters': 612483, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 150.54, 'max_sentence1_length': 478, 'unique_sentence1': 1993, 'min_sentence2_length': 9, 'average_sentence2_length': 156.16, 'max_sentence2_length': 773, 'unique_sentence2': 1997}, 'bod_Tibt-tha_Thai': {'num_samples': 1997, 'number_of_characters': 538097, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 150.54, 'max_sentence1_length': 478, 'unique_sentence1': 1993, 'min_sentence2_length': 7, 'average_sentence2_length': 118.91, 'max_sentence2_length': 439, 'unique_sentence2': 1996}, 'bos_Latn-bel_Cyrl': {'num_samples': 1997, 'number_of_characters': 511000, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 127.65, 'max_sentence1_length': 434, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.24, 'max_sentence2_length': 422, 'unique_sentence2': 1996}, 'bos_Latn-bul_Cyrl': {'num_samples': 1997, 'number_of_characters': 524799, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 127.65, 'max_sentence1_length': 434, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 135.15, 'max_sentence2_length': 493, 'unique_sentence2': 1996}, 'bos_Latn-ces_Latn': {'num_samples': 1997, 'number_of_characters': 496228, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 127.65, 'max_sentence1_length': 434, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 120.84, 'max_sentence2_length': 474, 'unique_sentence2': 1997}, 'bos_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 502630, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 127.65, 'max_sentence1_length': 434, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 
437, 'unique_sentence2': 1997}, 'bos_Latn-hrv_Latn': {'num_samples': 1997, 'number_of_characters': 510835, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 127.65, 'max_sentence1_length': 434, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.15, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'bos_Latn-mkd_Cyrl': {'num_samples': 1997, 'number_of_characters': 522801, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 127.65, 'max_sentence1_length': 434, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.15, 'max_sentence2_length': 451, 'unique_sentence2': 1997}, 'bos_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 532776, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 127.65, 'max_sentence1_length': 434, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'bos_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 529803, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 127.65, 'max_sentence1_length': 434, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'bos_Latn-slk_Latn': {'num_samples': 1997, 'number_of_characters': 507879, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 127.65, 'max_sentence1_length': 434, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 126.67, 'max_sentence2_length': 403, 'unique_sentence2': 1996}, 'bos_Latn-slv_Latn': {'num_samples': 1997, 'number_of_characters': 507806, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 127.65, 'max_sentence1_length': 434, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.64, 
'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'bos_Latn-srp_Cyrl': {'num_samples': 1997, 'number_of_characters': 507213, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 127.65, 'max_sentence1_length': 434, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.34, 'max_sentence2_length': 439, 'unique_sentence2': 1995}, 'bos_Latn-srp_Latn': {'num_samples': 1997, 'number_of_characters': 511051, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 127.65, 'max_sentence1_length': 434, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 128.26, 'max_sentence2_length': 452, 'unique_sentence2': 1996}, 'bos_Latn-ukr_Cyrl': {'num_samples': 1997, 'number_of_characters': 517693, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 127.65, 'max_sentence1_length': 434, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 131.59, 'max_sentence2_length': 440, 'unique_sentence2': 1996}, 'bul_Cyrl-bel_Cyrl': {'num_samples': 1997, 'number_of_characters': 525979, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 135.15, 'max_sentence1_length': 493, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.24, 'max_sentence2_length': 422, 'unique_sentence2': 1996}, 'bul_Cyrl-bos_Latn': {'num_samples': 1997, 'number_of_characters': 524799, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 135.15, 'max_sentence1_length': 493, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 127.65, 'max_sentence2_length': 434, 'unique_sentence2': 1996}, 'bul_Cyrl-ces_Latn': {'num_samples': 1997, 'number_of_characters': 511207, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 135.15, 'max_sentence1_length': 493, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 
120.84, 'max_sentence2_length': 474, 'unique_sentence2': 1997}, 'bul_Cyrl-eng_Latn': {'num_samples': 1997, 'number_of_characters': 517609, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 135.15, 'max_sentence1_length': 493, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'bul_Cyrl-hrv_Latn': {'num_samples': 1997, 'number_of_characters': 525814, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 135.15, 'max_sentence1_length': 493, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.15, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'bul_Cyrl-mkd_Cyrl': {'num_samples': 1997, 'number_of_characters': 537780, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 135.15, 'max_sentence1_length': 493, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.15, 'max_sentence2_length': 451, 'unique_sentence2': 1997}, 'bul_Cyrl-pol_Latn': {'num_samples': 1997, 'number_of_characters': 547755, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 135.15, 'max_sentence1_length': 493, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'bul_Cyrl-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 544782, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 135.15, 'max_sentence1_length': 493, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'bul_Cyrl-slk_Latn': {'num_samples': 1997, 'number_of_characters': 522858, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 135.15, 'max_sentence1_length': 493, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 
'average_sentence2_length': 126.67, 'max_sentence2_length': 403, 'unique_sentence2': 1996}, 'bul_Cyrl-slv_Latn': {'num_samples': 1997, 'number_of_characters': 522785, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 135.15, 'max_sentence1_length': 493, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.64, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'bul_Cyrl-srp_Cyrl': {'num_samples': 1997, 'number_of_characters': 522192, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 135.15, 'max_sentence1_length': 493, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.34, 'max_sentence2_length': 439, 'unique_sentence2': 1995}, 'bul_Cyrl-srp_Latn': {'num_samples': 1997, 'number_of_characters': 526030, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 135.15, 'max_sentence1_length': 493, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 128.26, 'max_sentence2_length': 452, 'unique_sentence2': 1996}, 'bul_Cyrl-ukr_Cyrl': {'num_samples': 1997, 'number_of_characters': 532672, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 135.15, 'max_sentence1_length': 493, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 131.59, 'max_sentence2_length': 440, 'unique_sentence2': 1996}, 'cat_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 530680, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 141.69, 'max_sentence1_length': 460, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'cat_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 576068, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 141.69, 'max_sentence1_length': 460, 'unique_sentence1': 1997, 
'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'cat_Latn-glg_Latn': {'num_samples': 1997, 'number_of_characters': 554946, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 141.69, 'max_sentence1_length': 460, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 136.2, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'cat_Latn-ita_Latn': {'num_samples': 1997, 'number_of_characters': 572177, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 141.69, 'max_sentence1_length': 460, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 144.83, 'max_sentence2_length': 623, 'unique_sentence2': 1996}, 'cat_Latn-mlt_Latn': {'num_samples': 1997, 'number_of_characters': 560435, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 141.69, 'max_sentence1_length': 460, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 138.95, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'cat_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 560175, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 141.69, 'max_sentence1_length': 460, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'cat_Latn-ron_Latn': {'num_samples': 1997, 'number_of_characters': 575445, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 141.69, 'max_sentence1_length': 460, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 146.46, 'max_sentence2_length': 518, 'unique_sentence2': 1997}, 'cat_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 571160, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 141.69, 'max_sentence1_length': 460, 'unique_sentence1': 
1997, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'ces_Latn-bel_Cyrl': {'num_samples': 1997, 'number_of_characters': 497408, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 120.84, 'max_sentence1_length': 474, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 128.24, 'max_sentence2_length': 422, 'unique_sentence2': 1996}, 'ces_Latn-bos_Latn': {'num_samples': 1997, 'number_of_characters': 496228, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 120.84, 'max_sentence1_length': 474, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 127.65, 'max_sentence2_length': 434, 'unique_sentence2': 1996}, 'ces_Latn-bul_Cyrl': {'num_samples': 1997, 'number_of_characters': 511207, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 120.84, 'max_sentence1_length': 474, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 135.15, 'max_sentence2_length': 493, 'unique_sentence2': 1996}, 'ces_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 489038, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 120.84, 'max_sentence1_length': 474, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ces_Latn-hrv_Latn': {'num_samples': 1997, 'number_of_characters': 497243, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 120.84, 'max_sentence1_length': 474, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 128.15, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'ces_Latn-mkd_Cyrl': {'num_samples': 1997, 'number_of_characters': 509209, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 120.84, 'max_sentence1_length': 474, 
'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 134.15, 'max_sentence2_length': 451, 'unique_sentence2': 1997}, 'ces_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 519184, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 120.84, 'max_sentence1_length': 474, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'ces_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 516211, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 120.84, 'max_sentence1_length': 474, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'ces_Latn-slk_Latn': {'num_samples': 1997, 'number_of_characters': 494287, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 120.84, 'max_sentence1_length': 474, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 126.67, 'max_sentence2_length': 403, 'unique_sentence2': 1996}, 'ces_Latn-slv_Latn': {'num_samples': 1997, 'number_of_characters': 494214, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 120.84, 'max_sentence1_length': 474, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 126.64, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'ces_Latn-srp_Cyrl': {'num_samples': 1997, 'number_of_characters': 493621, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 120.84, 'max_sentence1_length': 474, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 126.34, 'max_sentence2_length': 439, 'unique_sentence2': 1995}, 'ces_Latn-srp_Latn': {'num_samples': 1997, 'number_of_characters': 497459, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 120.84, 'max_sentence1_length': 
474, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 128.26, 'max_sentence2_length': 452, 'unique_sentence2': 1996}, 'ces_Latn-ukr_Cyrl': {'num_samples': 1997, 'number_of_characters': 504101, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 120.84, 'max_sentence1_length': 474, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 131.59, 'max_sentence2_length': 440, 'unique_sentence2': 1996}, 'ckb_Arab-arb_Arab': {'num_samples': 1997, 'number_of_characters': 483548, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 126.37, 'max_sentence1_length': 399, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'ckb_Arab-eng_Latn': {'num_samples': 1997, 'number_of_characters': 500087, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 126.37, 'max_sentence1_length': 399, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ckb_Arab-fas_Arab': {'num_samples': 1997, 'number_of_characters': 495706, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 126.37, 'max_sentence1_length': 399, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'ckb_Arab-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 452663, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 126.37, 'max_sentence1_length': 399, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'ckb_Arab-kmr_Latn': {'num_samples': 1997, 'number_of_characters': 498313, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 126.37, 
'max_sentence1_length': 399, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 123.16, 'max_sentence2_length': 420, 'unique_sentence2': 1996}, 'ckb_Arab-mey_Arab': {'num_samples': 1997, 'number_of_characters': 466202, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 126.37, 'max_sentence1_length': 399, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 107.08, 'max_sentence2_length': 392, 'unique_sentence2': 1993}, 'ckb_Arab-prs_Arab': {'num_samples': 1997, 'number_of_characters': 494903, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 126.37, 'max_sentence1_length': 399, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 121.45, 'max_sentence2_length': 365, 'unique_sentence2': 1997}, 'ckb_Arab-pus_Arab': {'num_samples': 1997, 'number_of_characters': 495000, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 126.37, 'max_sentence1_length': 399, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 121.5, 'max_sentence2_length': 366, 'unique_sentence2': 1996}, 'ckb_Arab-shi_Arab': {'num_samples': 1997, 'number_of_characters': 467280, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 126.37, 'max_sentence1_length': 399, 'unique_sentence1': 1995, 'min_sentence2_length': 3, 'average_sentence2_length': 107.62, 'max_sentence2_length': 378, 'unique_sentence2': 1996}, 'ckb_Arab-tgk_Cyrl': {'num_samples': 1997, 'number_of_characters': 526514, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 126.37, 'max_sentence1_length': 399, 'unique_sentence1': 1995, 'min_sentence2_length': 3, 'average_sentence2_length': 137.28, 'max_sentence2_length': 451, 'unique_sentence2': 1995}, 'cym_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 514225, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 
133.45, 'max_sentence1_length': 444, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'cym_Latn-gle_Latn': {'num_samples': 1997, 'number_of_characters': 561314, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.45, 'max_sentence1_length': 444, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 147.63, 'max_sentence2_length': 461, 'unique_sentence2': 1997}, 'dan_Latn-afr_Latn': {'num_samples': 1997, 'number_of_characters': 520490, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 126.26, 'max_sentence1_length': 522, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 134.38, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'dan_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 547788, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 126.26, 'max_sentence1_length': 522, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'dan_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 499858, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 126.26, 'max_sentence1_length': 522, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'dan_Latn-fao_Latn': {'num_samples': 1997, 'number_of_characters': 509941, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 126.26, 'max_sentence1_length': 522, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 129.1, 'max_sentence2_length': 433, 'unique_sentence2': 1997}, 'dan_Latn-isl_Latn': {'num_samples': 1997, 'number_of_characters': 514346, 'unique_pairs': 1996, 'min_sentence1_length': 7, 
'average_sentence1_length': 126.26, 'max_sentence1_length': 522, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 131.3, 'max_sentence2_length': 399, 'unique_sentence2': 1996}, 'dan_Latn-ltz_Latn': {'num_samples': 1997, 'number_of_characters': 532895, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 126.26, 'max_sentence1_length': 522, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 140.59, 'max_sentence2_length': 543, 'unique_sentence2': 1996}, 'dan_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 544053, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 126.26, 'max_sentence1_length': 522, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'dan_Latn-nno_Latn': {'num_samples': 1997, 'number_of_characters': 500495, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 126.26, 'max_sentence1_length': 522, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.37, 'max_sentence2_length': 417, 'unique_sentence2': 1996}, 'dan_Latn-nob_Latn': {'num_samples': 1997, 'number_of_characters': 503582, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 126.26, 'max_sentence1_length': 522, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 125.91, 'max_sentence2_length': 482, 'unique_sentence2': 1996}, 'dan_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 503965, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 126.26, 'max_sentence1_length': 522, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'deu_Latn-afr_Latn': {'num_samples': 1997, 'number_of_characters': 564002, 'unique_pairs': 1996, 'min_sentence1_length': 
9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 134.38, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'deu_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 526831, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'deu_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 539452, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'deu_Latn-dan_Latn': {'num_samples': 1997, 'number_of_characters': 547788, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 126.26, 'max_sentence2_length': 522, 'unique_sentence2': 1995}, 'deu_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 594777, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'deu_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 543370, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'deu_Latn-fao_Latn': {'num_samples': 1997, 'number_of_characters': 553453, 'unique_pairs': 1997, 
'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.1, 'max_sentence2_length': 433, 'unique_sentence2': 1997}, 'deu_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 538989, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'deu_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 565450, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'deu_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 588758, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'deu_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 495946, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'deu_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 557225, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'deu_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 574026, 'unique_pairs': 
1997, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'deu_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 582622, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'deu_Latn-isl_Latn': {'num_samples': 1997, 'number_of_characters': 557858, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 131.3, 'max_sentence2_length': 399, 'unique_sentence2': 1996}, 'deu_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 407276, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'deu_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 429055, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'deu_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 555047, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'deu_Latn-ltz_Latn': {'num_samples': 1997, 'number_of_characters': 576407, 'unique_pairs': 
1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 140.59, 'max_sentence2_length': 543, 'unique_sentence2': 1996}, 'deu_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 587565, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'deu_Latn-nno_Latn': {'num_samples': 1997, 'number_of_characters': 544007, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.37, 'max_sentence2_length': 417, 'unique_sentence2': 1996}, 'deu_Latn-nob_Latn': {'num_samples': 1997, 'number_of_characters': 547094, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 125.91, 'max_sentence2_length': 482, 'unique_sentence2': 1996}, 'deu_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 573516, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'deu_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 572865, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'deu_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 570543, 
'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'deu_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 583850, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'deu_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 568159, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'deu_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 547477, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'deu_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 605611, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'deu_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 561263, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'deu_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 
566771, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'deu_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 387128, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'deu_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 553382, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'div_Thaa-ben_Beng': {'num_samples': 1997, 'number_of_characters': 547650, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'div_Thaa-eng_Latn': {'num_samples': 1997, 'number_of_characters': 551568, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'div_Thaa-eus_Latn': {'num_samples': 1997, 'number_of_characters': 579051, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'div_Thaa-guj_Gujr': {'num_samples': 1997, 
'number_of_characters': 548779, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'div_Thaa-hin_Deva': {'num_samples': 1997, 'number_of_characters': 565423, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'div_Thaa-kan_Knda': {'num_samples': 1997, 'number_of_characters': 569384, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'div_Thaa-mar_Deva': {'num_samples': 1997, 'number_of_characters': 564735, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'div_Thaa-nep_Deva': {'num_samples': 1997, 'number_of_characters': 552071, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'div_Thaa-pan_Guru': {'num_samples': 1997, 'number_of_characters': 554270, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'div_Thaa-sin_Sinh': {'num_samples': 
1997, 'number_of_characters': 562589, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'div_Thaa-snd_Arab': {'num_samples': 1997, 'number_of_characters': 524175, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'div_Thaa-tam_Taml': {'num_samples': 1997, 'number_of_characters': 613809, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'div_Thaa-tel_Telu': {'num_samples': 1997, 'number_of_characters': 551375, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'div_Thaa-urd_Arab': {'num_samples': 1997, 'number_of_characters': 551846, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'dzo_Tibt-bod_Tibt': {'num_samples': 1997, 'number_of_characters': 543850, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 121.79, 'max_sentence1_length': 411, 'unique_sentence1': 1992, 'min_sentence2_length': 8, 'average_sentence2_length': 150.54, 'max_sentence2_length': 478, 'unique_sentence2': 1993}, 'dzo_Tibt-eng_Latn': 
{'num_samples': 1997, 'number_of_characters': 490941, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 121.79, 'max_sentence1_length': 411, 'unique_sentence1': 1992, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'dzo_Tibt-khm_Khmr': {'num_samples': 1997, 'number_of_characters': 531712, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 121.79, 'max_sentence1_length': 411, 'unique_sentence1': 1992, 'min_sentence2_length': 6, 'average_sentence2_length': 144.46, 'max_sentence2_length': 517, 'unique_sentence2': 1996}, 'dzo_Tibt-lao_Laoo': {'num_samples': 1997, 'number_of_characters': 510201, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 121.79, 'max_sentence1_length': 411, 'unique_sentence1': 1992, 'min_sentence2_length': 7, 'average_sentence2_length': 133.69, 'max_sentence2_length': 507, 'unique_sentence2': 1997}, 'dzo_Tibt-mon_Mong': {'num_samples': 1997, 'number_of_characters': 502269, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 121.79, 'max_sentence1_length': 411, 'unique_sentence1': 1992, 'min_sentence2_length': 11, 'average_sentence2_length': 129.72, 'max_sentence2_length': 414, 'unique_sentence2': 1997}, 'dzo_Tibt-mya_Mymr': {'num_samples': 1997, 'number_of_characters': 555075, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 121.79, 'max_sentence1_length': 411, 'unique_sentence1': 1992, 'min_sentence2_length': 9, 'average_sentence2_length': 156.16, 'max_sentence2_length': 773, 'unique_sentence2': 1997}, 'dzo_Tibt-tha_Thai': {'num_samples': 1997, 'number_of_characters': 480689, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 121.79, 'max_sentence1_length': 411, 'unique_sentence1': 1992, 'min_sentence2_length': 7, 'average_sentence2_length': 118.91, 'max_sentence2_length': 439, 'unique_sentence2': 1996}, 
'ell_Grek-arb_Arab': {'num_samples': 1997, 'number_of_characters': 530308, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'ell_Grek-ben_Beng': {'num_samples': 1997, 'number_of_characters': 542929, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'ell_Grek-deu_Latn': {'num_samples': 1997, 'number_of_characters': 594777, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'ell_Grek-eng_Latn': {'num_samples': 1997, 'number_of_characters': 546847, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ell_Grek-fas_Arab': {'num_samples': 1997, 'number_of_characters': 542466, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'ell_Grek-fin_Latn': {'num_samples': 1997, 'number_of_characters': 568927, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 
1996}, 'ell_Grek-fra_Latn': {'num_samples': 1997, 'number_of_characters': 592235, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'ell_Grek-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 499423, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'ell_Grek-hin_Deva': {'num_samples': 1997, 'number_of_characters': 560702, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'ell_Grek-hun_Latn': {'num_samples': 1997, 'number_of_characters': 577503, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'ell_Grek-hye_Armn': {'num_samples': 1997, 'number_of_characters': 563842, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 132.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'ell_Grek-ind_Latn': {'num_samples': 1997, 'number_of_characters': 586099, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 
'unique_sentence2': 1997}, 'ell_Grek-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 410753, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'ell_Grek-kat_Geor': {'num_samples': 1997, 'number_of_characters': 565719, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 133.5, 'max_sentence2_length': 503, 'unique_sentence2': 1995}, 'ell_Grek-kor_Hang': {'num_samples': 1997, 'number_of_characters': 432532, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'ell_Grek-lit_Latn': {'num_samples': 1997, 'number_of_characters': 558524, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'ell_Grek-nld_Latn': {'num_samples': 1997, 'number_of_characters': 591042, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'ell_Grek-pol_Latn': {'num_samples': 1997, 'number_of_characters': 576993, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 
'unique_sentence2': 1996}, 'ell_Grek-por_Latn': {'num_samples': 1997, 'number_of_characters': 576342, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'ell_Grek-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 574020, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'ell_Grek-spa_Latn': {'num_samples': 1997, 'number_of_characters': 587327, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'ell_Grek-sqi_Latn': {'num_samples': 1997, 'number_of_characters': 582734, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 142.02, 'max_sentence2_length': 461, 'unique_sentence2': 1996}, 'ell_Grek-swa_Latn': {'num_samples': 1997, 'number_of_characters': 571636, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'ell_Grek-swe_Latn': {'num_samples': 1997, 'number_of_characters': 550954, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 
430, 'unique_sentence2': 1996}, 'ell_Grek-tam_Taml': {'num_samples': 1997, 'number_of_characters': 609088, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'ell_Grek-tur_Latn': {'num_samples': 1997, 'number_of_characters': 564740, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'ell_Grek-vie_Latn': {'num_samples': 1997, 'number_of_characters': 570248, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'ell_Grek-zho_Hant': {'num_samples': 1997, 'number_of_characters': 390605, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'ell_Grek-zul_Latn': {'num_samples': 1997, 'number_of_characters': 556859, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'eng_Latn-afr_Latn': {'num_samples': 1997, 'number_of_characters': 516072, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 134.38, 
'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'eng_Latn-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 415227, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'eng_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 478901, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'eng_Latn-aze_Latn': {'num_samples': 1997, 'number_of_characters': 517354, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 135.02, 'max_sentence2_length': 398, 'unique_sentence2': 1997}, 'eng_Latn-bak_Cyrl': {'num_samples': 1997, 'number_of_characters': 494046, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 123.35, 'max_sentence2_length': 437, 'unique_sentence2': 1995}, 'eng_Latn-bel_Cyrl': {'num_samples': 1997, 'number_of_characters': 503810, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 128.24, 'max_sentence2_length': 422, 'unique_sentence2': 1996}, 'eng_Latn-bem_Latn': {'num_samples': 1997, 'number_of_characters': 546212, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 
149.47, 'max_sentence2_length': 465, 'unique_sentence2': 1997}, 'eng_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 491522, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'eng_Latn-bod_Tibt': {'num_samples': 1997, 'number_of_characters': 548349, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 150.54, 'max_sentence2_length': 478, 'unique_sentence2': 1993}, 'eng_Latn-bos_Latn': {'num_samples': 1997, 'number_of_characters': 502630, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 127.65, 'max_sentence2_length': 434, 'unique_sentence2': 1996}, 'eng_Latn-bul_Cyrl': {'num_samples': 1997, 'number_of_characters': 517609, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 135.15, 'max_sentence2_length': 493, 'unique_sentence2': 1996}, 'eng_Latn-cat_Latn': {'num_samples': 1997, 'number_of_characters': 530680, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 141.69, 'max_sentence2_length': 460, 'unique_sentence2': 1997}, 'eng_Latn-ces_Latn': {'num_samples': 1997, 'number_of_characters': 489038, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 
'average_sentence2_length': 120.84, 'max_sentence2_length': 474, 'unique_sentence2': 1997}, 'eng_Latn-ckb_Arab': {'num_samples': 1997, 'number_of_characters': 500087, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 126.37, 'max_sentence2_length': 399, 'unique_sentence2': 1995}, 'eng_Latn-cym_Latn': {'num_samples': 1997, 'number_of_characters': 514225, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 133.45, 'max_sentence2_length': 444, 'unique_sentence2': 1997}, 'eng_Latn-dan_Latn': {'num_samples': 1997, 'number_of_characters': 499858, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 126.26, 'max_sentence2_length': 522, 'unique_sentence2': 1995}, 'eng_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 543370, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'eng_Latn-div_Thaa': {'num_samples': 1997, 'number_of_characters': 551568, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'eng_Latn-dzo_Tibt': {'num_samples': 1997, 'number_of_characters': 490941, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 
'min_sentence2_length': 5, 'average_sentence2_length': 121.79, 'max_sentence2_length': 411, 'unique_sentence2': 1992}, 'eng_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 546847, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'eng_Latn-eus_Latn': {'num_samples': 1997, 'number_of_characters': 522923, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'eng_Latn-ewe_Latn': {'num_samples': 1997, 'number_of_characters': 486698, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 119.67, 'max_sentence2_length': 493, 'unique_sentence2': 1994}, 'eng_Latn-fao_Latn': {'num_samples': 1997, 'number_of_characters': 505523, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 129.1, 'max_sentence2_length': 433, 'unique_sentence2': 1997}, 'eng_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 491059, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'eng_Latn-fij_Latn': {'num_samples': 1997, 'number_of_characters': 548225, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 
1997, 'min_sentence2_length': 10, 'average_sentence2_length': 150.48, 'max_sentence2_length': 448, 'unique_sentence2': 1988}, 'eng_Latn-fil_Latn': {'num_samples': 1997, 'number_of_characters': 541140, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 146.93, 'max_sentence2_length': 554, 'unique_sentence2': 1997}, 'eng_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 517520, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'eng_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 540828, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'eng_Latn-fuc_Latn': {'num_samples': 1997, 'number_of_characters': 476200, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 114.41, 'max_sentence2_length': 376, 'unique_sentence2': 1996}, 'eng_Latn-gle_Latn': {'num_samples': 1997, 'number_of_characters': 542529, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 147.63, 'max_sentence2_length': 461, 'unique_sentence2': 1997}, 'eng_Latn-glg_Latn': {'num_samples': 1997, 'number_of_characters': 519706, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 
'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 136.2, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'eng_Latn-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 492651, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'eng_Latn-hau_Latn': {'num_samples': 1997, 'number_of_characters': 517686, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'eng_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 448016, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'eng_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 509295, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'eng_Latn-hmn_Latn': {'num_samples': 1997, 'number_of_characters': 578510, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 165.64, 'max_sentence2_length': 643, 'unique_sentence2': 1997}, 'eng_Latn-hrv_Latn': {'num_samples': 1997, 'number_of_characters': 503645, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 
437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 128.15, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'eng_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 526096, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'eng_Latn-hye_Armn': {'num_samples': 1997, 'number_of_characters': 512435, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 132.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'eng_Latn-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 493821, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 'unique_sentence2': 1997}, 'eng_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 534692, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'eng_Latn-isl_Latn': {'num_samples': 1997, 'number_of_characters': 509928, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 131.3, 'max_sentence2_length': 399, 'unique_sentence2': 1996}, 'eng_Latn-ita_Latn': {'num_samples': 1997, 'number_of_characters': 536937, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 
'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 144.83, 'max_sentence2_length': 623, 'unique_sentence2': 1996}, 'eng_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 359346, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'eng_Latn-kan_Knda': {'num_samples': 1997, 'number_of_characters': 513256, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'eng_Latn-kat_Geor': {'num_samples': 1997, 'number_of_characters': 514312, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 133.5, 'max_sentence2_length': 503, 'unique_sentence2': 1995}, 'eng_Latn-kaz_Cyrl': {'num_samples': 1997, 'number_of_characters': 507996, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 130.33, 'max_sentence2_length': 473, 'unique_sentence2': 1996}, 'eng_Latn-khm_Khmr': {'num_samples': 1997, 'number_of_characters': 536211, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 144.46, 'max_sentence2_length': 517, 'unique_sentence2': 1996}, 'eng_Latn-kin_Latn': {'num_samples': 1997, 'number_of_characters': 551507, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 
124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 152.12, 'max_sentence2_length': 541, 'unique_sentence2': 1996}, 'eng_Latn-kir_Cyrl': {'num_samples': 1997, 'number_of_characters': 498584, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 125.62, 'max_sentence2_length': 395, 'unique_sentence2': 1996}, 'eng_Latn-kmr_Latn': {'num_samples': 1997, 'number_of_characters': 493666, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 123.16, 'max_sentence2_length': 420, 'unique_sentence2': 1996}, 'eng_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 381125, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'eng_Latn-lao_Laoo': {'num_samples': 1997, 'number_of_characters': 514700, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 133.69, 'max_sentence2_length': 507, 'unique_sentence2': 1997}, 'eng_Latn-lav_Latn': {'num_samples': 1997, 'number_of_characters': 515908, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 134.3, 'max_sentence2_length': 503, 'unique_sentence2': 1994}, 'eng_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 507117, 'unique_pairs': 1997, 'min_sentence1_length': 7, 
'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'eng_Latn-ltz_Latn': {'num_samples': 1997, 'number_of_characters': 528477, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 140.59, 'max_sentence2_length': 543, 'unique_sentence2': 1996}, 'eng_Latn-mal_Mlym': {'num_samples': 1997, 'number_of_characters': 551872, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 152.3, 'max_sentence2_length': 540, 'unique_sentence2': 1996}, 'eng_Latn-mar_Deva': {'num_samples': 1997, 'number_of_characters': 508607, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'eng_Latn-mey_Arab': {'num_samples': 1997, 'number_of_characters': 461555, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 107.08, 'max_sentence2_length': 392, 'unique_sentence2': 1993}, 'eng_Latn-mkd_Cyrl': {'num_samples': 1997, 'number_of_characters': 515611, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 134.15, 'max_sentence2_length': 451, 'unique_sentence2': 1997}, 'eng_Latn-mlg_Latn': {'num_samples': 1997, 'number_of_characters': 568028, 'unique_pairs': 1997, 'min_sentence1_length': 
7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 160.39, 'max_sentence2_length': 559, 'unique_sentence2': 1994}, 'eng_Latn-mlt_Latn': {'num_samples': 1997, 'number_of_characters': 525195, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 138.95, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'eng_Latn-mon_Mong': {'num_samples': 1997, 'number_of_characters': 506768, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 129.72, 'max_sentence2_length': 414, 'unique_sentence2': 1997}, 'eng_Latn-mri_Latn': {'num_samples': 1997, 'number_of_characters': 521844, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 137.27, 'max_sentence2_length': 443, 'unique_sentence2': 1997}, 'eng_Latn-msa_Latn': {'num_samples': 1997, 'number_of_characters': 524903, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 138.8, 'max_sentence2_length': 463, 'unique_sentence2': 1997}, 'eng_Latn-mya_Mymr': {'num_samples': 1997, 'number_of_characters': 559574, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 156.16, 'max_sentence2_length': 773, 'unique_sentence2': 1997}, 'eng_Latn-nde_Latn': {'num_samples': 1997, 'number_of_characters': 545459, 'unique_pairs': 1997, 
'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 149.09, 'max_sentence2_length': 590, 'unique_sentence2': 1997}, 'eng_Latn-nep_Deva': {'num_samples': 1997, 'number_of_characters': 495943, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'eng_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 539635, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'eng_Latn-nno_Latn': {'num_samples': 1997, 'number_of_characters': 496077, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.37, 'max_sentence2_length': 417, 'unique_sentence2': 1996}, 'eng_Latn-nob_Latn': {'num_samples': 1997, 'number_of_characters': 499164, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 125.91, 'max_sentence2_length': 482, 'unique_sentence2': 1996}, 'eng_Latn-nso_Latn': {'num_samples': 1997, 'number_of_characters': 539219, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 1996}, 'eng_Latn-nya_Latn': {'num_samples': 1997, 'number_of_characters': 532002, 'unique_pairs': 
1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 142.35, 'max_sentence2_length': 464, 'unique_sentence2': 1993}, 'eng_Latn-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 485151, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'eng_Latn-pan_Guru': {'num_samples': 1997, 'number_of_characters': 498142, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'eng_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 525586, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'eng_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 524935, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'eng_Latn-prs_Arab': {'num_samples': 1997, 'number_of_characters': 490256, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 121.45, 'max_sentence2_length': 365, 'unique_sentence2': 1997}, 'eng_Latn-pus_Arab': {'num_samples': 1997, 'number_of_characters': 490353, 
'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 121.5, 'max_sentence2_length': 366, 'unique_sentence2': 1996}, 'eng_Latn-ron_Latn': {'num_samples': 1997, 'number_of_characters': 540205, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 146.46, 'max_sentence2_length': 518, 'unique_sentence2': 1997}, 'eng_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 522613, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'eng_Latn-shi_Arab': {'num_samples': 1997, 'number_of_characters': 462633, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 3, 'average_sentence2_length': 107.62, 'max_sentence2_length': 378, 'unique_sentence2': 1996}, 'eng_Latn-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 506461, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'eng_Latn-slk_Latn': {'num_samples': 1997, 'number_of_characters': 500689, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 126.67, 'max_sentence2_length': 403, 'unique_sentence2': 1996}, 'eng_Latn-slv_Latn': {'num_samples': 1997, 'number_of_characters': 
500616, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 126.64, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'eng_Latn-smo_Latn': {'num_samples': 1997, 'number_of_characters': 525575, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 139.14, 'max_sentence2_length': 431, 'unique_sentence2': 1996}, 'eng_Latn-sna_Latn': {'num_samples': 1997, 'number_of_characters': 546050, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 149.39, 'max_sentence2_length': 511, 'unique_sentence2': 1995}, 'eng_Latn-snd_Arab': {'num_samples': 1997, 'number_of_characters': 468047, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'eng_Latn-som_Latn': {'num_samples': 1997, 'number_of_characters': 539012, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'eng_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 535920, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'eng_Latn-sqi_Latn': {'num_samples': 1997, 
'number_of_characters': 531327, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 142.02, 'max_sentence2_length': 461, 'unique_sentence2': 1996}, 'eng_Latn-srp_Cyrl': {'num_samples': 1997, 'number_of_characters': 500023, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 126.34, 'max_sentence2_length': 439, 'unique_sentence2': 1995}, 'eng_Latn-srp_Latn': {'num_samples': 1997, 'number_of_characters': 503861, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 128.26, 'max_sentence2_length': 452, 'unique_sentence2': 1996}, 'eng_Latn-ssw_Latn': {'num_samples': 1997, 'number_of_characters': 535862, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'eng_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 520229, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'eng_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 499547, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'eng_Latn-tah_Latn': {'num_samples': 
1997, 'number_of_characters': 557343, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 155.04, 'max_sentence2_length': 524, 'unique_sentence2': 1997}, 'eng_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 557681, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'eng_Latn-tat_Cyrl': {'num_samples': 1997, 'number_of_characters': 493646, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 123.15, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'eng_Latn-tel_Telu': {'num_samples': 1997, 'number_of_characters': 495247, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'eng_Latn-tgk_Cyrl': {'num_samples': 1997, 'number_of_characters': 521867, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 3, 'average_sentence2_length': 137.28, 'max_sentence2_length': 451, 'unique_sentence2': 1995}, 'eng_Latn-tha_Thai': {'num_samples': 1997, 'number_of_characters': 485188, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 118.91, 'max_sentence2_length': 439, 'unique_sentence2': 1996}, 'eng_Latn-tir_Ethi': 
{'num_samples': 1997, 'number_of_characters': 412958, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 272, 'unique_sentence2': 1996}, 'eng_Latn-ton_Latn': {'num_samples': 1997, 'number_of_characters': 561360, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 157.06, 'max_sentence2_length': 468, 'unique_sentence2': 1997}, 'eng_Latn-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 582003, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 'unique_sentence2': 1997}, 'eng_Latn-tuk_Latn': {'num_samples': 1997, 'number_of_characters': 532994, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 142.85, 'max_sentence2_length': 576, 'unique_sentence2': 1996}, 'eng_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 513333, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'eng_Latn-uig_Arab': {'num_samples': 1997, 'number_of_characters': 558742, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 155.74, 'max_sentence2_length': 592, 'unique_sentence2': 1996}, 
'eng_Latn-ukr_Cyrl': {'num_samples': 1997, 'number_of_characters': 510503, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 131.59, 'max_sentence2_length': 440, 'unique_sentence2': 1996}, 'eng_Latn-urd_Arab': {'num_samples': 1997, 'number_of_characters': 495718, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'eng_Latn-uzb_Latn': {'num_samples': 1997, 'number_of_characters': 541415, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 147.07, 'max_sentence2_length': 470, 'unique_sentence2': 1996}, 'eng_Latn-ven_Latn': {'num_samples': 1997, 'number_of_characters': 547476, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 150.1, 'max_sentence2_length': 535, 'unique_sentence2': 1993}, 'eng_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 518841, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'eng_Latn-wol_Latn': {'num_samples': 1997, 'number_of_characters': 487523, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 
1990}, 'eng_Latn-xho_Latn': {'num_samples': 1997, 'number_of_characters': 515810, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'eng_Latn-yor_Latn': {'num_samples': 1997, 'number_of_characters': 563808, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'eng_Latn-yue_Hant': {'num_samples': 1997, 'number_of_characters': 326607, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 4, 'average_sentence2_length': 39.5, 'max_sentence2_length': 133, 'unique_sentence2': 1996}, 'eng_Latn-zho_Hans': {'num_samples': 1997, 'number_of_characters': 332681, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 4, 'average_sentence2_length': 42.54, 'max_sentence2_length': 263, 'unique_sentence2': 1997}, 'eng_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 339198, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'eng_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 505452, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 
1996}, 'eus_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 519005, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'eus_Latn-div_Thaa': {'num_samples': 1997, 'number_of_characters': 579051, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'eus_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 522923, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'eus_Latn-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 520134, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'eus_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 536778, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'eus_Latn-kan_Knda': {'num_samples': 1997, 'number_of_characters': 540739, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 
'unique_sentence2': 1996}, 'eus_Latn-mar_Deva': {'num_samples': 1997, 'number_of_characters': 536090, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'eus_Latn-nep_Deva': {'num_samples': 1997, 'number_of_characters': 523426, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'eus_Latn-pan_Guru': {'num_samples': 1997, 'number_of_characters': 525625, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'eus_Latn-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 533944, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'eus_Latn-snd_Arab': {'num_samples': 1997, 'number_of_characters': 495530, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'eus_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 585164, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 
581, 'unique_sentence2': 1997}, 'eus_Latn-tel_Telu': {'num_samples': 1997, 'number_of_characters': 522730, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'eus_Latn-urd_Arab': {'num_samples': 1997, 'number_of_characters': 523201, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'ewe_Latn-bem_Latn': {'num_samples': 1997, 'number_of_characters': 537470, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 119.67, 'max_sentence1_length': 493, 'unique_sentence1': 1994, 'min_sentence2_length': 8, 'average_sentence2_length': 149.47, 'max_sentence2_length': 465, 'unique_sentence2': 1997}, 'ewe_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 486698, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 119.67, 'max_sentence1_length': 493, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ewe_Latn-fuc_Latn': {'num_samples': 1997, 'number_of_characters': 467458, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 119.67, 'max_sentence1_length': 493, 'unique_sentence1': 1994, 'min_sentence2_length': 6, 'average_sentence2_length': 114.41, 'max_sentence2_length': 376, 'unique_sentence2': 1996}, 'ewe_Latn-kin_Latn': {'num_samples': 1997, 'number_of_characters': 542765, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 119.67, 'max_sentence1_length': 493, 'unique_sentence1': 1994, 'min_sentence2_length': 10, 'average_sentence2_length': 152.12, 
'max_sentence2_length': 541, 'unique_sentence2': 1996}, 'ewe_Latn-nde_Latn': {'num_samples': 1997, 'number_of_characters': 536717, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 119.67, 'max_sentence1_length': 493, 'unique_sentence1': 1994, 'min_sentence2_length': 6, 'average_sentence2_length': 149.09, 'max_sentence2_length': 590, 'unique_sentence2': 1997}, 'ewe_Latn-nya_Latn': {'num_samples': 1997, 'number_of_characters': 523260, 'unique_pairs': 1995, 'min_sentence1_length': 6, 'average_sentence1_length': 119.67, 'max_sentence1_length': 493, 'unique_sentence1': 1994, 'min_sentence2_length': 10, 'average_sentence2_length': 142.35, 'max_sentence2_length': 464, 'unique_sentence2': 1993}, 'ewe_Latn-sna_Latn': {'num_samples': 1997, 'number_of_characters': 537308, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 119.67, 'max_sentence1_length': 493, 'unique_sentence1': 1994, 'min_sentence2_length': 6, 'average_sentence2_length': 149.39, 'max_sentence2_length': 511, 'unique_sentence2': 1995}, 'ewe_Latn-ven_Latn': {'num_samples': 1997, 'number_of_characters': 538734, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 119.67, 'max_sentence1_length': 493, 'unique_sentence1': 1994, 'min_sentence2_length': 10, 'average_sentence2_length': 150.1, 'max_sentence2_length': 535, 'unique_sentence2': 1993}, 'fao_Latn-afr_Latn': {'num_samples': 1997, 'number_of_characters': 526155, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.1, 'max_sentence1_length': 433, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 134.38, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'fao_Latn-dan_Latn': {'num_samples': 1997, 'number_of_characters': 509941, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.1, 'max_sentence1_length': 433, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 
126.26, 'max_sentence2_length': 522, 'unique_sentence2': 1995}, 'fao_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 553453, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.1, 'max_sentence1_length': 433, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'fao_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 505523, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.1, 'max_sentence1_length': 433, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'fao_Latn-isl_Latn': {'num_samples': 1997, 'number_of_characters': 520011, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.1, 'max_sentence1_length': 433, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 131.3, 'max_sentence2_length': 399, 'unique_sentence2': 1996}, 'fao_Latn-ltz_Latn': {'num_samples': 1997, 'number_of_characters': 538560, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.1, 'max_sentence1_length': 433, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 140.59, 'max_sentence2_length': 543, 'unique_sentence2': 1996}, 'fao_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 549718, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.1, 'max_sentence1_length': 433, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'fao_Latn-nno_Latn': {'num_samples': 1997, 'number_of_characters': 506160, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.1, 'max_sentence1_length': 433, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 
124.37, 'max_sentence2_length': 417, 'unique_sentence2': 1996}, 'fao_Latn-nob_Latn': {'num_samples': 1997, 'number_of_characters': 509247, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.1, 'max_sentence1_length': 433, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 125.91, 'max_sentence2_length': 482, 'unique_sentence2': 1996}, 'fao_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 509630, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.1, 'max_sentence1_length': 433, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'fas_Arab-arb_Arab': {'num_samples': 1997, 'number_of_characters': 474520, 'unique_pairs': 1995, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'fas_Arab-ben_Beng': {'num_samples': 1997, 'number_of_characters': 487141, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'fas_Arab-ckb_Arab': {'num_samples': 1997, 'number_of_characters': 495706, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 126.37, 'max_sentence2_length': 399, 'unique_sentence2': 1995}, 'fas_Arab-deu_Latn': {'num_samples': 1997, 'number_of_characters': 538989, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 
'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'fas_Arab-ell_Grek': {'num_samples': 1997, 'number_of_characters': 542466, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'fas_Arab-eng_Latn': {'num_samples': 1997, 'number_of_characters': 491059, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'fas_Arab-fin_Latn': {'num_samples': 1997, 'number_of_characters': 513139, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'fas_Arab-fra_Latn': {'num_samples': 1997, 'number_of_characters': 536447, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'fas_Arab-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 443635, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'fas_Arab-hin_Deva': {'num_samples': 1997, 'number_of_characters': 504914, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 
5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'fas_Arab-hun_Latn': {'num_samples': 1997, 'number_of_characters': 521715, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'fas_Arab-ind_Latn': {'num_samples': 1997, 'number_of_characters': 530311, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'fas_Arab-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 354965, 'unique_pairs': 1995, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'fas_Arab-kmr_Latn': {'num_samples': 1997, 'number_of_characters': 489285, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 123.16, 'max_sentence2_length': 420, 'unique_sentence2': 1996}, 'fas_Arab-kor_Hang': {'num_samples': 1997, 'number_of_characters': 376744, 'unique_pairs': 1995, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'fas_Arab-lit_Latn': {'num_samples': 1997, 'number_of_characters': 502736, 'unique_pairs': 1995, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 
9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'fas_Arab-mey_Arab': {'num_samples': 1997, 'number_of_characters': 457174, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 107.08, 'max_sentence2_length': 392, 'unique_sentence2': 1993}, 'fas_Arab-nld_Latn': {'num_samples': 1997, 'number_of_characters': 535254, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'fas_Arab-pol_Latn': {'num_samples': 1997, 'number_of_characters': 521205, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'fas_Arab-por_Latn': {'num_samples': 1997, 'number_of_characters': 520554, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'fas_Arab-prs_Arab': {'num_samples': 1997, 'number_of_characters': 485875, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 121.45, 'max_sentence2_length': 365, 'unique_sentence2': 1997}, 'fas_Arab-pus_Arab': {'num_samples': 1997, 'number_of_characters': 485972, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 
'min_sentence2_length': 8, 'average_sentence2_length': 121.5, 'max_sentence2_length': 366, 'unique_sentence2': 1996}, 'fas_Arab-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 518232, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'fas_Arab-shi_Arab': {'num_samples': 1997, 'number_of_characters': 458252, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 3, 'average_sentence2_length': 107.62, 'max_sentence2_length': 378, 'unique_sentence2': 1996}, 'fas_Arab-spa_Latn': {'num_samples': 1997, 'number_of_characters': 531539, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'fas_Arab-swa_Latn': {'num_samples': 1997, 'number_of_characters': 515848, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'fas_Arab-swe_Latn': {'num_samples': 1997, 'number_of_characters': 495166, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'fas_Arab-tam_Taml': {'num_samples': 1997, 'number_of_characters': 553300, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 
1995, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'fas_Arab-tgk_Cyrl': {'num_samples': 1997, 'number_of_characters': 517486, 'unique_pairs': 1995, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 3, 'average_sentence2_length': 137.28, 'max_sentence2_length': 451, 'unique_sentence2': 1995}, 'fas_Arab-tur_Latn': {'num_samples': 1997, 'number_of_characters': 508952, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'fas_Arab-vie_Latn': {'num_samples': 1997, 'number_of_characters': 514460, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'fas_Arab-zho_Hant': {'num_samples': 1997, 'number_of_characters': 334817, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'fas_Arab-zul_Latn': {'num_samples': 1997, 'number_of_characters': 501071, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'fij_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 548225, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 150.48, 'max_sentence1_length': 448, 
'unique_sentence1': 1988, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'fij_Latn-fil_Latn': {'num_samples': 1997, 'number_of_characters': 593925, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 150.48, 'max_sentence1_length': 448, 'unique_sentence1': 1988, 'min_sentence2_length': 10, 'average_sentence2_length': 146.93, 'max_sentence2_length': 554, 'unique_sentence2': 1997}, 'fij_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 587477, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 150.48, 'max_sentence1_length': 448, 'unique_sentence1': 1988, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'fij_Latn-mal_Mlym': {'num_samples': 1997, 'number_of_characters': 604657, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 150.48, 'max_sentence1_length': 448, 'unique_sentence1': 1988, 'min_sentence2_length': 7, 'average_sentence2_length': 152.3, 'max_sentence2_length': 540, 'unique_sentence2': 1996}, 'fij_Latn-mlg_Latn': {'num_samples': 1997, 'number_of_characters': 620813, 'unique_pairs': 1995, 'min_sentence1_length': 10, 'average_sentence1_length': 150.48, 'max_sentence1_length': 448, 'unique_sentence1': 1988, 'min_sentence2_length': 6, 'average_sentence2_length': 160.39, 'max_sentence2_length': 559, 'unique_sentence2': 1994}, 'fij_Latn-mri_Latn': {'num_samples': 1997, 'number_of_characters': 574629, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 150.48, 'max_sentence1_length': 448, 'unique_sentence1': 1988, 'min_sentence2_length': 9, 'average_sentence2_length': 137.27, 'max_sentence2_length': 443, 'unique_sentence2': 1997}, 'fij_Latn-msa_Latn': {'num_samples': 1997, 'number_of_characters': 577688, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 150.48, 
'max_sentence1_length': 448, 'unique_sentence1': 1988, 'min_sentence2_length': 6, 'average_sentence2_length': 138.8, 'max_sentence2_length': 463, 'unique_sentence2': 1997}, 'fij_Latn-smo_Latn': {'num_samples': 1997, 'number_of_characters': 578360, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 150.48, 'max_sentence1_length': 448, 'unique_sentence1': 1988, 'min_sentence2_length': 8, 'average_sentence2_length': 139.14, 'max_sentence2_length': 431, 'unique_sentence2': 1996}, 'fij_Latn-tah_Latn': {'num_samples': 1997, 'number_of_characters': 610128, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 150.48, 'max_sentence1_length': 448, 'unique_sentence1': 1988, 'min_sentence2_length': 9, 'average_sentence2_length': 155.04, 'max_sentence2_length': 524, 'unique_sentence2': 1997}, 'fij_Latn-ton_Latn': {'num_samples': 1997, 'number_of_characters': 614145, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 150.48, 'max_sentence1_length': 448, 'unique_sentence1': 1988, 'min_sentence2_length': 7, 'average_sentence2_length': 157.06, 'max_sentence2_length': 468, 'unique_sentence2': 1997}, 'fil_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 541140, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 146.93, 'max_sentence1_length': 554, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'fil_Latn-fij_Latn': {'num_samples': 1997, 'number_of_characters': 593925, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 146.93, 'max_sentence1_length': 554, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 150.48, 'max_sentence2_length': 448, 'unique_sentence2': 1988}, 'fil_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 580392, 'unique_pairs': 1997, 'min_sentence1_length': 10, 
'average_sentence1_length': 146.93, 'max_sentence1_length': 554, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'fil_Latn-mal_Mlym': {'num_samples': 1997, 'number_of_characters': 597572, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 146.93, 'max_sentence1_length': 554, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 152.3, 'max_sentence2_length': 540, 'unique_sentence2': 1996}, 'fil_Latn-mlg_Latn': {'num_samples': 1997, 'number_of_characters': 613728, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 146.93, 'max_sentence1_length': 554, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 160.39, 'max_sentence2_length': 559, 'unique_sentence2': 1994}, 'fil_Latn-mri_Latn': {'num_samples': 1997, 'number_of_characters': 567544, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 146.93, 'max_sentence1_length': 554, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 137.27, 'max_sentence2_length': 443, 'unique_sentence2': 1997}, 'fil_Latn-msa_Latn': {'num_samples': 1997, 'number_of_characters': 570603, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 146.93, 'max_sentence1_length': 554, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 138.8, 'max_sentence2_length': 463, 'unique_sentence2': 1997}, 'fil_Latn-smo_Latn': {'num_samples': 1997, 'number_of_characters': 571275, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 146.93, 'max_sentence1_length': 554, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 139.14, 'max_sentence2_length': 431, 'unique_sentence2': 1996}, 'fil_Latn-tah_Latn': {'num_samples': 1997, 'number_of_characters': 603043, 'unique_pairs': 1997, 
'min_sentence1_length': 10, 'average_sentence1_length': 146.93, 'max_sentence1_length': 554, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 155.04, 'max_sentence2_length': 524, 'unique_sentence2': 1997}, 'fil_Latn-ton_Latn': {'num_samples': 1997, 'number_of_characters': 607060, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 146.93, 'max_sentence1_length': 554, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 157.06, 'max_sentence2_length': 468, 'unique_sentence2': 1997}, 'fin_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 500981, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'fin_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 513602, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'fin_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 565450, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'fin_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 568927, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'fin_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 517520, 'unique_pairs': 
1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'fin_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 513139, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'fin_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 562908, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'fin_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 470096, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'fin_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 531375, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'fin_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 548176, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'fin_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 556772, 'unique_pairs': 
1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'fin_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 381426, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'fin_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 403205, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'fin_Latn-lav_Latn': {'num_samples': 1997, 'number_of_characters': 537988, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 134.3, 'max_sentence2_length': 503, 'unique_sentence2': 1994}, 'fin_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 529197, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'fin_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 561715, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'fin_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 547666, 'unique_pairs': 1996, 
'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'fin_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 547015, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'fin_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 544693, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'fin_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 558000, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'fin_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 542309, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'fin_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 521627, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'fin_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 579761, 'unique_pairs': 1997, 
'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'fin_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 535413, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'fin_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 540921, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'fin_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 361278, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'fin_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 527532, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'fra_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 524289, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'fra_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 536910, 'unique_pairs': 1997, 
'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'fra_Latn-cat_Latn': {'num_samples': 1997, 'number_of_characters': 576068, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 141.69, 'max_sentence2_length': 460, 'unique_sentence2': 1997}, 'fra_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 588758, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'fra_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 592235, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'fra_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 540828, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'fra_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 536447, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'fra_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 562908, 'unique_pairs': 
1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'fra_Latn-glg_Latn': {'num_samples': 1997, 'number_of_characters': 565094, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 136.2, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'fra_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 493404, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'fra_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 554683, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'fra_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 571484, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'fra_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 580080, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'fra_Latn-ita_Latn': {'num_samples': 1997, 'number_of_characters': 582325, 'unique_pairs': 
1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 144.83, 'max_sentence2_length': 623, 'unique_sentence2': 1996}, 'fra_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 404734, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'fra_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 426513, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'fra_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 552505, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'fra_Latn-mlt_Latn': {'num_samples': 1997, 'number_of_characters': 570583, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 138.95, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'fra_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 585023, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'fra_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 570974, 'unique_pairs': 
1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'fra_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 570323, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'fra_Latn-ron_Latn': {'num_samples': 1997, 'number_of_characters': 585593, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 146.46, 'max_sentence2_length': 518, 'unique_sentence2': 1997}, 'fra_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 568001, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'fra_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 581308, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'fra_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 565617, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'fra_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 544935, 
'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'fra_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 603069, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'fra_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 558721, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'fra_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 564229, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'fra_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 384586, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'fra_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 550840, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'fuc_Latn-bem_Latn': {'num_samples': 1997, 'number_of_characters': 
526972, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 114.41, 'max_sentence1_length': 376, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 149.47, 'max_sentence2_length': 465, 'unique_sentence2': 1997}, 'fuc_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 476200, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 114.41, 'max_sentence1_length': 376, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'fuc_Latn-ewe_Latn': {'num_samples': 1997, 'number_of_characters': 467458, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 114.41, 'max_sentence1_length': 376, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 119.67, 'max_sentence2_length': 493, 'unique_sentence2': 1994}, 'fuc_Latn-kin_Latn': {'num_samples': 1997, 'number_of_characters': 532267, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 114.41, 'max_sentence1_length': 376, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 152.12, 'max_sentence2_length': 541, 'unique_sentence2': 1996}, 'fuc_Latn-nde_Latn': {'num_samples': 1997, 'number_of_characters': 526219, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 114.41, 'max_sentence1_length': 376, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 149.09, 'max_sentence2_length': 590, 'unique_sentence2': 1997}, 'fuc_Latn-nya_Latn': {'num_samples': 1997, 'number_of_characters': 512762, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 114.41, 'max_sentence1_length': 376, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 142.35, 'max_sentence2_length': 464, 'unique_sentence2': 1993}, 'fuc_Latn-sna_Latn': {'num_samples': 1997, 
'number_of_characters': 526810, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 114.41, 'max_sentence1_length': 376, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 149.39, 'max_sentence2_length': 511, 'unique_sentence2': 1995}, 'fuc_Latn-ven_Latn': {'num_samples': 1997, 'number_of_characters': 528236, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 114.41, 'max_sentence1_length': 376, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 150.1, 'max_sentence2_length': 535, 'unique_sentence2': 1993}, 'gle_Latn-cym_Latn': {'num_samples': 1997, 'number_of_characters': 561314, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 147.63, 'max_sentence1_length': 461, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 133.45, 'max_sentence2_length': 444, 'unique_sentence2': 1997}, 'gle_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 542529, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 147.63, 'max_sentence1_length': 461, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'glg_Latn-cat_Latn': {'num_samples': 1997, 'number_of_characters': 554946, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 136.2, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 141.69, 'max_sentence2_length': 460, 'unique_sentence2': 1997}, 'glg_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 519706, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 136.2, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'glg_Latn-fra_Latn': {'num_samples': 
1997, 'number_of_characters': 565094, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 136.2, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'glg_Latn-ita_Latn': {'num_samples': 1997, 'number_of_characters': 561203, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 136.2, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 144.83, 'max_sentence2_length': 623, 'unique_sentence2': 1996}, 'glg_Latn-mlt_Latn': {'num_samples': 1997, 'number_of_characters': 549461, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 136.2, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 138.95, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'glg_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 549201, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 136.2, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'glg_Latn-ron_Latn': {'num_samples': 1997, 'number_of_characters': 564471, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 136.2, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 146.46, 'max_sentence2_length': 518, 'unique_sentence2': 1997}, 'glg_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 560186, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 136.2, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'guj_Gujr-ben_Beng': {'num_samples': 
1997, 'number_of_characters': 488733, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'guj_Gujr-div_Thaa': {'num_samples': 1997, 'number_of_characters': 548779, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'guj_Gujr-eng_Latn': {'num_samples': 1997, 'number_of_characters': 492651, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'guj_Gujr-eus_Latn': {'num_samples': 1997, 'number_of_characters': 520134, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'guj_Gujr-hin_Deva': {'num_samples': 1997, 'number_of_characters': 506506, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'guj_Gujr-kan_Knda': {'num_samples': 1997, 'number_of_characters': 510467, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'guj_Gujr-mar_Deva': 
{'num_samples': 1997, 'number_of_characters': 505818, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'guj_Gujr-nep_Deva': {'num_samples': 1997, 'number_of_characters': 493154, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'guj_Gujr-pan_Guru': {'num_samples': 1997, 'number_of_characters': 495353, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'guj_Gujr-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 503672, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'guj_Gujr-snd_Arab': {'num_samples': 1997, 'number_of_characters': 465258, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'guj_Gujr-tam_Taml': {'num_samples': 1997, 'number_of_characters': 554892, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 
'guj_Gujr-tel_Telu': {'num_samples': 1997, 'number_of_characters': 492458, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'guj_Gujr-urd_Arab': {'num_samples': 1997, 'number_of_characters': 492929, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'hau_Latn-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 437473, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'hau_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 517686, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'hau_Latn-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 516067, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 'unique_sentence2': 1997}, 'hau_Latn-nso_Latn': {'num_samples': 1997, 'number_of_characters': 561465, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 
1996}, 'hau_Latn-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 507397, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'hau_Latn-som_Latn': {'num_samples': 1997, 'number_of_characters': 561258, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'hau_Latn-ssw_Latn': {'num_samples': 1997, 'number_of_characters': 558108, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'hau_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 542475, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'hau_Latn-tir_Ethi': {'num_samples': 1997, 'number_of_characters': 435204, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 272, 'unique_sentence2': 1996}, 'hau_Latn-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 604249, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 
'unique_sentence2': 1997}, 'hau_Latn-wol_Latn': {'num_samples': 1997, 'number_of_characters': 509769, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 'hau_Latn-xho_Latn': {'num_samples': 1997, 'number_of_characters': 538056, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'hau_Latn-yor_Latn': {'num_samples': 1997, 'number_of_characters': 586054, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'hau_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 527698, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'heb_Hebr-arb_Arab': {'num_samples': 1997, 'number_of_characters': 431477, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'heb_Hebr-ben_Beng': {'num_samples': 1997, 'number_of_characters': 444098, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 
402, 'unique_sentence2': 1997}, 'heb_Hebr-ckb_Arab': {'num_samples': 1997, 'number_of_characters': 452663, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 126.37, 'max_sentence2_length': 399, 'unique_sentence2': 1995}, 'heb_Hebr-deu_Latn': {'num_samples': 1997, 'number_of_characters': 495946, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'heb_Hebr-ell_Grek': {'num_samples': 1997, 'number_of_characters': 499423, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'heb_Hebr-eng_Latn': {'num_samples': 1997, 'number_of_characters': 448016, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'heb_Hebr-fas_Arab': {'num_samples': 1997, 'number_of_characters': 443635, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'heb_Hebr-fin_Latn': {'num_samples': 1997, 'number_of_characters': 470096, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 
463, 'unique_sentence2': 1996}, 'heb_Hebr-fra_Latn': {'num_samples': 1997, 'number_of_characters': 493404, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'heb_Hebr-hin_Deva': {'num_samples': 1997, 'number_of_characters': 461871, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'heb_Hebr-hun_Latn': {'num_samples': 1997, 'number_of_characters': 478672, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'heb_Hebr-ind_Latn': {'num_samples': 1997, 'number_of_characters': 487268, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'heb_Hebr-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 311922, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'heb_Hebr-kmr_Latn': {'num_samples': 1997, 'number_of_characters': 446242, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 123.16, 'max_sentence2_length': 420, 
'unique_sentence2': 1996}, 'heb_Hebr-kor_Hang': {'num_samples': 1997, 'number_of_characters': 333701, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'heb_Hebr-lit_Latn': {'num_samples': 1997, 'number_of_characters': 459693, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'heb_Hebr-mey_Arab': {'num_samples': 1997, 'number_of_characters': 414131, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 107.08, 'max_sentence2_length': 392, 'unique_sentence2': 1993}, 'heb_Hebr-nld_Latn': {'num_samples': 1997, 'number_of_characters': 492211, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'heb_Hebr-pol_Latn': {'num_samples': 1997, 'number_of_characters': 478162, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'heb_Hebr-por_Latn': {'num_samples': 1997, 'number_of_characters': 477511, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 
'unique_sentence2': 1996}, 'heb_Hebr-prs_Arab': {'num_samples': 1997, 'number_of_characters': 442832, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 121.45, 'max_sentence2_length': 365, 'unique_sentence2': 1997}, 'heb_Hebr-pus_Arab': {'num_samples': 1997, 'number_of_characters': 442929, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 121.5, 'max_sentence2_length': 366, 'unique_sentence2': 1996}, 'heb_Hebr-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 475189, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'heb_Hebr-shi_Arab': {'num_samples': 1997, 'number_of_characters': 415209, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 107.62, 'max_sentence2_length': 378, 'unique_sentence2': 1996}, 'heb_Hebr-spa_Latn': {'num_samples': 1997, 'number_of_characters': 488496, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'heb_Hebr-swa_Latn': {'num_samples': 1997, 'number_of_characters': 472805, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 
'unique_sentence2': 1997}, 'heb_Hebr-swe_Latn': {'num_samples': 1997, 'number_of_characters': 452123, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'heb_Hebr-tam_Taml': {'num_samples': 1997, 'number_of_characters': 510257, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'heb_Hebr-tgk_Cyrl': {'num_samples': 1997, 'number_of_characters': 474443, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 137.28, 'max_sentence2_length': 451, 'unique_sentence2': 1995}, 'heb_Hebr-tur_Latn': {'num_samples': 1997, 'number_of_characters': 465909, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'heb_Hebr-vie_Latn': {'num_samples': 1997, 'number_of_characters': 471417, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'heb_Hebr-zho_Hant': {'num_samples': 1997, 'number_of_characters': 291774, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 
'unique_sentence2': 1996}, 'heb_Hebr-zul_Latn': {'num_samples': 1997, 'number_of_characters': 458028, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'hin_Deva-arb_Arab': {'num_samples': 1997, 'number_of_characters': 492756, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'hin_Deva-ben_Beng': {'num_samples': 1997, 'number_of_characters': 505377, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'hin_Deva-deu_Latn': {'num_samples': 1997, 'number_of_characters': 557225, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'hin_Deva-div_Thaa': {'num_samples': 1997, 'number_of_characters': 565423, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'hin_Deva-ell_Grek': {'num_samples': 1997, 'number_of_characters': 560702, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 
584, 'unique_sentence2': 1996}, 'hin_Deva-eng_Latn': {'num_samples': 1997, 'number_of_characters': 509295, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'hin_Deva-eus_Latn': {'num_samples': 1997, 'number_of_characters': 536778, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'hin_Deva-fas_Arab': {'num_samples': 1997, 'number_of_characters': 504914, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'hin_Deva-fin_Latn': {'num_samples': 1997, 'number_of_characters': 531375, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'hin_Deva-fra_Latn': {'num_samples': 1997, 'number_of_characters': 554683, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'hin_Deva-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 506506, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 
'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'hin_Deva-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 461871, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'hin_Deva-hun_Latn': {'num_samples': 1997, 'number_of_characters': 539951, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'hin_Deva-ind_Latn': {'num_samples': 1997, 'number_of_characters': 548547, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'hin_Deva-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 373201, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'hin_Deva-kan_Knda': {'num_samples': 1997, 'number_of_characters': 527111, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'hin_Deva-kor_Hang': {'num_samples': 1997, 'number_of_characters': 394980, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 
'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'hin_Deva-lit_Latn': {'num_samples': 1997, 'number_of_characters': 520972, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'hin_Deva-mar_Deva': {'num_samples': 1997, 'number_of_characters': 522462, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'hin_Deva-nep_Deva': {'num_samples': 1997, 'number_of_characters': 509798, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'hin_Deva-nld_Latn': {'num_samples': 1997, 'number_of_characters': 553490, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'hin_Deva-pan_Guru': {'num_samples': 1997, 'number_of_characters': 511997, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'hin_Deva-pol_Latn': {'num_samples': 1997, 'number_of_characters': 539441, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 
139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'hin_Deva-por_Latn': {'num_samples': 1997, 'number_of_characters': 538790, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'hin_Deva-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 536468, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'hin_Deva-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 520316, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'hin_Deva-snd_Arab': {'num_samples': 1997, 'number_of_characters': 481902, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'hin_Deva-spa_Latn': {'num_samples': 1997, 'number_of_characters': 549775, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'hin_Deva-swa_Latn': {'num_samples': 1997, 'number_of_characters': 534084, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 
'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'hin_Deva-swe_Latn': {'num_samples': 1997, 'number_of_characters': 513402, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'hin_Deva-tam_Taml': {'num_samples': 1997, 'number_of_characters': 571536, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'hin_Deva-tel_Telu': {'num_samples': 1997, 'number_of_characters': 509102, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'hin_Deva-tur_Latn': {'num_samples': 1997, 'number_of_characters': 527188, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'hin_Deva-urd_Arab': {'num_samples': 1997, 'number_of_characters': 509573, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'hin_Deva-vie_Latn': {'num_samples': 1997, 'number_of_characters': 532696, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 
'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'hin_Deva-zho_Hant': {'num_samples': 1997, 'number_of_characters': 353053, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'hin_Deva-zul_Latn': {'num_samples': 1997, 'number_of_characters': 519307, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'hmn_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 578510, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 165.64, 'max_sentence1_length': 643, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'hrv_Latn-bel_Cyrl': {'num_samples': 1997, 'number_of_characters': 512015, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.15, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 128.24, 'max_sentence2_length': 422, 'unique_sentence2': 1996}, 'hrv_Latn-bos_Latn': {'num_samples': 1997, 'number_of_characters': 510835, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.15, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 127.65, 'max_sentence2_length': 434, 'unique_sentence2': 1996}, 'hrv_Latn-bul_Cyrl': {'num_samples': 1997, 'number_of_characters': 525814, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.15, 'max_sentence1_length': 508, 'unique_sentence1': 
1997, 'min_sentence2_length': 6, 'average_sentence2_length': 135.15, 'max_sentence2_length': 493, 'unique_sentence2': 1996}, 'hrv_Latn-ces_Latn': {'num_samples': 1997, 'number_of_characters': 497243, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.15, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 120.84, 'max_sentence2_length': 474, 'unique_sentence2': 1997}, 'hrv_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 503645, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.15, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'hrv_Latn-mkd_Cyrl': {'num_samples': 1997, 'number_of_characters': 523816, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.15, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 134.15, 'max_sentence2_length': 451, 'unique_sentence2': 1997}, 'hrv_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 533791, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.15, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'hrv_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 530818, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.15, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'hrv_Latn-slk_Latn': {'num_samples': 1997, 'number_of_characters': 508894, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.15, 'max_sentence1_length': 508, 
'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 126.67, 'max_sentence2_length': 403, 'unique_sentence2': 1996}, 'hrv_Latn-slv_Latn': {'num_samples': 1997, 'number_of_characters': 508821, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.15, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 126.64, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'hrv_Latn-srp_Cyrl': {'num_samples': 1997, 'number_of_characters': 508228, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.15, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 126.34, 'max_sentence2_length': 439, 'unique_sentence2': 1995}, 'hrv_Latn-srp_Latn': {'num_samples': 1997, 'number_of_characters': 512066, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.15, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 128.26, 'max_sentence2_length': 452, 'unique_sentence2': 1996}, 'hrv_Latn-ukr_Cyrl': {'num_samples': 1997, 'number_of_characters': 518708, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.15, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 131.59, 'max_sentence2_length': 440, 'unique_sentence2': 1996}, 'hun_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 509557, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'hun_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 522178, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 
508, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'hun_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 574026, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'hun_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 577503, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'hun_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 526096, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'hun_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 521715, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'hun_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 548176, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'hun_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 571484, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 
508, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'hun_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 478672, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'hun_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 539951, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'hun_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 565348, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'hun_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 390002, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'hun_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 411781, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'hun_Latn-lav_Latn': {'num_samples': 1997, 'number_of_characters': 546564, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 
'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 134.3, 'max_sentence2_length': 503, 'unique_sentence2': 1994}, 'hun_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 537773, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'hun_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 570291, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'hun_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 556242, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'hun_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 555591, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'hun_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 553269, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'hun_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 566576, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 
'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'hun_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 550885, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'hun_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 530203, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'hun_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 588337, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'hun_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 543989, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'hun_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 549497, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'hun_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 369854, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 
'unique_sentence1': 1997, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'hun_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 536108, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'hye_Armn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 563842, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 132.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'hye_Armn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 512435, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 132.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'hye_Armn-kat_Geor': {'num_samples': 1997, 'number_of_characters': 531307, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 132.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 133.5, 'max_sentence2_length': 503, 'unique_sentence2': 1995}, 'hye_Armn-sqi_Latn': {'num_samples': 1997, 'number_of_characters': 548322, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 132.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 142.02, 'max_sentence2_length': 461, 'unique_sentence2': 1996}, 'ibo_Latn-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 413608, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 'max_sentence1_length': 
469, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'ibo_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 493821, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ibo_Latn-hau_Latn': {'num_samples': 1997, 'number_of_characters': 516067, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'ibo_Latn-nso_Latn': {'num_samples': 1997, 'number_of_characters': 537600, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 1996}, 'ibo_Latn-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 483532, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'ibo_Latn-som_Latn': {'num_samples': 1997, 'number_of_characters': 537393, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'ibo_Latn-ssw_Latn': {'num_samples': 1997, 'number_of_characters': 534243, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 
'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'ibo_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 518610, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'ibo_Latn-tir_Ethi': {'num_samples': 1997, 'number_of_characters': 411339, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 272, 'unique_sentence2': 1996}, 'ibo_Latn-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 580384, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 'unique_sentence2': 1997}, 'ibo_Latn-wol_Latn': {'num_samples': 1997, 'number_of_characters': 485904, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 'ibo_Latn-xho_Latn': {'num_samples': 1997, 'number_of_characters': 514191, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'ibo_Latn-yor_Latn': {'num_samples': 1997, 'number_of_characters': 562189, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 
123.24, 'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'ibo_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 503833, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'ind_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 518153, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'ind_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 530774, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'ind_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 582622, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'ind_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 586099, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'ind_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 534692, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 
143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ind_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 530311, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'ind_Latn-fij_Latn': {'num_samples': 1997, 'number_of_characters': 587477, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 150.48, 'max_sentence2_length': 448, 'unique_sentence2': 1988}, 'ind_Latn-fil_Latn': {'num_samples': 1997, 'number_of_characters': 580392, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 146.93, 'max_sentence2_length': 554, 'unique_sentence2': 1997}, 'ind_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 556772, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'ind_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 580080, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'ind_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 487268, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 
143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'ind_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 548547, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'ind_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 565348, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'ind_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 398598, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'ind_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 420377, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'ind_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 546369, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'ind_Latn-mal_Mlym': {'num_samples': 1997, 'number_of_characters': 591124, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 
'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 152.3, 'max_sentence2_length': 540, 'unique_sentence2': 1996}, 'ind_Latn-mlg_Latn': {'num_samples': 1997, 'number_of_characters': 607280, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 160.39, 'max_sentence2_length': 559, 'unique_sentence2': 1994}, 'ind_Latn-mri_Latn': {'num_samples': 1997, 'number_of_characters': 561096, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 137.27, 'max_sentence2_length': 443, 'unique_sentence2': 1997}, 'ind_Latn-msa_Latn': {'num_samples': 1997, 'number_of_characters': 564155, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 138.8, 'max_sentence2_length': 463, 'unique_sentence2': 1997}, 'ind_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 578887, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'ind_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 564838, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'ind_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 564187, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 
'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'ind_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 561865, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'ind_Latn-smo_Latn': {'num_samples': 1997, 'number_of_characters': 564827, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 139.14, 'max_sentence2_length': 431, 'unique_sentence2': 1996}, 'ind_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 575172, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'ind_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 559481, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'ind_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 538799, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'ind_Latn-tah_Latn': {'num_samples': 1997, 'number_of_characters': 596595, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 
'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 155.04, 'max_sentence2_length': 524, 'unique_sentence2': 1997}, 'ind_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 596933, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'ind_Latn-ton_Latn': {'num_samples': 1997, 'number_of_characters': 600612, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 157.06, 'max_sentence2_length': 468, 'unique_sentence2': 1997}, 'ind_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 552585, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'ind_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 558093, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'ind_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 378450, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'ind_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 544704, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 
'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'isl_Latn-afr_Latn': {'num_samples': 1997, 'number_of_characters': 530560, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 131.3, 'max_sentence1_length': 399, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 134.38, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'isl_Latn-dan_Latn': {'num_samples': 1997, 'number_of_characters': 514346, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 131.3, 'max_sentence1_length': 399, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 126.26, 'max_sentence2_length': 522, 'unique_sentence2': 1995}, 'isl_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 557858, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 131.3, 'max_sentence1_length': 399, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'isl_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 509928, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 131.3, 'max_sentence1_length': 399, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'isl_Latn-fao_Latn': {'num_samples': 1997, 'number_of_characters': 520011, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 131.3, 'max_sentence1_length': 399, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.1, 'max_sentence2_length': 433, 'unique_sentence2': 1997}, 'isl_Latn-ltz_Latn': {'num_samples': 1997, 'number_of_characters': 542965, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 131.3, 
'max_sentence1_length': 399, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 140.59, 'max_sentence2_length': 543, 'unique_sentence2': 1996}, 'isl_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 554123, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 131.3, 'max_sentence1_length': 399, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'isl_Latn-nno_Latn': {'num_samples': 1997, 'number_of_characters': 510565, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 131.3, 'max_sentence1_length': 399, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.37, 'max_sentence2_length': 417, 'unique_sentence2': 1996}, 'isl_Latn-nob_Latn': {'num_samples': 1997, 'number_of_characters': 513652, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 131.3, 'max_sentence1_length': 399, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 125.91, 'max_sentence2_length': 482, 'unique_sentence2': 1996}, 'isl_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 514035, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 131.3, 'max_sentence1_length': 399, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'ita_Latn-cat_Latn': {'num_samples': 1997, 'number_of_characters': 572177, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 144.83, 'max_sentence1_length': 623, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 141.69, 'max_sentence2_length': 460, 'unique_sentence2': 1997}, 'ita_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 536937, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 144.83, 
'max_sentence1_length': 623, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ita_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 582325, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 144.83, 'max_sentence1_length': 623, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'ita_Latn-glg_Latn': {'num_samples': 1997, 'number_of_characters': 561203, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 144.83, 'max_sentence1_length': 623, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 136.2, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'ita_Latn-mlt_Latn': {'num_samples': 1997, 'number_of_characters': 566692, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 144.83, 'max_sentence1_length': 623, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 138.95, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'ita_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 566432, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 144.83, 'max_sentence1_length': 623, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'ita_Latn-ron_Latn': {'num_samples': 1997, 'number_of_characters': 581702, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 144.83, 'max_sentence1_length': 623, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 146.46, 'max_sentence2_length': 518, 'unique_sentence2': 1997}, 'ita_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 577417, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 
144.83, 'max_sentence1_length': 623, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'jpn_Jpan-arb_Arab': {'num_samples': 1997, 'number_of_characters': 342807, 'unique_pairs': 1995, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'jpn_Jpan-ben_Beng': {'num_samples': 1997, 'number_of_characters': 355428, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'jpn_Jpan-deu_Latn': {'num_samples': 1997, 'number_of_characters': 407276, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'jpn_Jpan-ell_Grek': {'num_samples': 1997, 'number_of_characters': 410753, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'jpn_Jpan-eng_Latn': {'num_samples': 1997, 'number_of_characters': 359346, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'jpn_Jpan-fas_Arab': {'num_samples': 1997, 'number_of_characters': 354965, 'unique_pairs': 1995, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 
'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'jpn_Jpan-fin_Latn': {'num_samples': 1997, 'number_of_characters': 381426, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'jpn_Jpan-fra_Latn': {'num_samples': 1997, 'number_of_characters': 404734, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'jpn_Jpan-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 311922, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'jpn_Jpan-hin_Deva': {'num_samples': 1997, 'number_of_characters': 373201, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'jpn_Jpan-hun_Latn': {'num_samples': 1997, 'number_of_characters': 390002, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'jpn_Jpan-ind_Latn': {'num_samples': 1997, 'number_of_characters': 398598, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 
'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'jpn_Jpan-kor_Hang': {'num_samples': 1997, 'number_of_characters': 245031, 'unique_pairs': 1995, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'jpn_Jpan-lit_Latn': {'num_samples': 1997, 'number_of_characters': 371023, 'unique_pairs': 1995, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'jpn_Jpan-nld_Latn': {'num_samples': 1997, 'number_of_characters': 403541, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'jpn_Jpan-pol_Latn': {'num_samples': 1997, 'number_of_characters': 389492, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'jpn_Jpan-por_Latn': {'num_samples': 1997, 'number_of_characters': 388841, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'jpn_Jpan-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 386519, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 
'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'jpn_Jpan-spa_Latn': {'num_samples': 1997, 'number_of_characters': 399826, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'jpn_Jpan-swa_Latn': {'num_samples': 1997, 'number_of_characters': 384135, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'jpn_Jpan-swe_Latn': {'num_samples': 1997, 'number_of_characters': 363453, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'jpn_Jpan-tam_Taml': {'num_samples': 1997, 'number_of_characters': 421587, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'jpn_Jpan-tur_Latn': {'num_samples': 1997, 'number_of_characters': 377239, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'jpn_Jpan-vie_Latn': {'num_samples': 1997, 'number_of_characters': 382747, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 
'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'jpn_Jpan-yue_Hant': {'num_samples': 1997, 'number_of_characters': 190513, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 4, 'average_sentence2_length': 39.5, 'max_sentence2_length': 133, 'unique_sentence2': 1996}, 'jpn_Jpan-zho_Hans': {'num_samples': 1997, 'number_of_characters': 196587, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 4, 'average_sentence2_length': 42.54, 'max_sentence2_length': 263, 'unique_sentence2': 1997}, 'jpn_Jpan-zho_Hant': {'num_samples': 1997, 'number_of_characters': 203104, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'jpn_Jpan-zul_Latn': {'num_samples': 1997, 'number_of_characters': 369358, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'kan_Knda-ben_Beng': {'num_samples': 1997, 'number_of_characters': 509338, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'kan_Knda-div_Thaa': {'num_samples': 1997, 'number_of_characters': 569384, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 132.97, 
'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'kan_Knda-eng_Latn': {'num_samples': 1997, 'number_of_characters': 513256, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'kan_Knda-eus_Latn': {'num_samples': 1997, 'number_of_characters': 540739, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'kan_Knda-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 510467, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'kan_Knda-hin_Deva': {'num_samples': 1997, 'number_of_characters': 527111, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'kan_Knda-mar_Deva': {'num_samples': 1997, 'number_of_characters': 526423, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'kan_Knda-nep_Deva': {'num_samples': 1997, 'number_of_characters': 513759, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 
132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'kan_Knda-pan_Guru': {'num_samples': 1997, 'number_of_characters': 515958, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'kan_Knda-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 524277, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'kan_Knda-snd_Arab': {'num_samples': 1997, 'number_of_characters': 485863, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'kan_Knda-tam_Taml': {'num_samples': 1997, 'number_of_characters': 575497, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'kan_Knda-tel_Telu': {'num_samples': 1997, 'number_of_characters': 513063, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'kan_Knda-urd_Arab': {'num_samples': 1997, 'number_of_characters': 513534, 'unique_pairs': 1996, 'min_sentence1_length': 6, 
'average_sentence1_length': 132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'kat_Geor-ell_Grek': {'num_samples': 1997, 'number_of_characters': 565719, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 133.5, 'max_sentence1_length': 503, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'kat_Geor-eng_Latn': {'num_samples': 1997, 'number_of_characters': 514312, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 133.5, 'max_sentence1_length': 503, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'kat_Geor-hye_Armn': {'num_samples': 1997, 'number_of_characters': 531307, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 133.5, 'max_sentence1_length': 503, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 132.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'kat_Geor-sqi_Latn': {'num_samples': 1997, 'number_of_characters': 550199, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 133.5, 'max_sentence1_length': 503, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 142.02, 'max_sentence2_length': 461, 'unique_sentence2': 1996}, 'kaz_Cyrl-aze_Latn': {'num_samples': 1997, 'number_of_characters': 529910, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 130.33, 'max_sentence1_length': 473, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 135.02, 'max_sentence2_length': 398, 'unique_sentence2': 1997}, 'kaz_Cyrl-bak_Cyrl': {'num_samples': 1997, 'number_of_characters': 506602, 'unique_pairs': 1997, 'min_sentence1_length': 8, 
'average_sentence1_length': 130.33, 'max_sentence1_length': 473, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 123.35, 'max_sentence2_length': 437, 'unique_sentence2': 1995}, 'kaz_Cyrl-eng_Latn': {'num_samples': 1997, 'number_of_characters': 507996, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 130.33, 'max_sentence1_length': 473, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'kaz_Cyrl-kir_Cyrl': {'num_samples': 1997, 'number_of_characters': 511140, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 130.33, 'max_sentence1_length': 473, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 125.62, 'max_sentence2_length': 395, 'unique_sentence2': 1996}, 'kaz_Cyrl-tat_Cyrl': {'num_samples': 1997, 'number_of_characters': 506202, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 130.33, 'max_sentence1_length': 473, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 123.15, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'kaz_Cyrl-tuk_Latn': {'num_samples': 1997, 'number_of_characters': 545550, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 130.33, 'max_sentence1_length': 473, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 142.85, 'max_sentence2_length': 576, 'unique_sentence2': 1996}, 'kaz_Cyrl-tur_Latn': {'num_samples': 1997, 'number_of_characters': 525889, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 130.33, 'max_sentence1_length': 473, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'kaz_Cyrl-uig_Arab': {'num_samples': 1997, 'number_of_characters': 571298, 'unique_pairs': 1997, 
'min_sentence1_length': 8, 'average_sentence1_length': 130.33, 'max_sentence1_length': 473, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 155.74, 'max_sentence2_length': 592, 'unique_sentence2': 1996}, 'kaz_Cyrl-uzb_Latn': {'num_samples': 1997, 'number_of_characters': 553971, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 130.33, 'max_sentence1_length': 473, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 147.07, 'max_sentence2_length': 470, 'unique_sentence2': 1996}, 'khm_Khmr-bod_Tibt': {'num_samples': 1997, 'number_of_characters': 589120, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 144.46, 'max_sentence1_length': 517, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 150.54, 'max_sentence2_length': 478, 'unique_sentence2': 1993}, 'khm_Khmr-dzo_Tibt': {'num_samples': 1997, 'number_of_characters': 531712, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 144.46, 'max_sentence1_length': 517, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 121.79, 'max_sentence2_length': 411, 'unique_sentence2': 1992}, 'khm_Khmr-eng_Latn': {'num_samples': 1997, 'number_of_characters': 536211, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 144.46, 'max_sentence1_length': 517, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'khm_Khmr-lao_Laoo': {'num_samples': 1997, 'number_of_characters': 555471, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 144.46, 'max_sentence1_length': 517, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 133.69, 'max_sentence2_length': 507, 'unique_sentence2': 1997}, 'khm_Khmr-mon_Mong': {'num_samples': 1997, 'number_of_characters': 547539, 'unique_pairs': 
1997, 'min_sentence1_length': 6, 'average_sentence1_length': 144.46, 'max_sentence1_length': 517, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 129.72, 'max_sentence2_length': 414, 'unique_sentence2': 1997}, 'khm_Khmr-mya_Mymr': {'num_samples': 1997, 'number_of_characters': 600345, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 144.46, 'max_sentence1_length': 517, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 156.16, 'max_sentence2_length': 773, 'unique_sentence2': 1997}, 'khm_Khmr-tha_Thai': {'num_samples': 1997, 'number_of_characters': 525959, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 144.46, 'max_sentence1_length': 517, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 118.91, 'max_sentence2_length': 439, 'unique_sentence2': 1996}, 'kin_Latn-bem_Latn': {'num_samples': 1997, 'number_of_characters': 602279, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 152.12, 'max_sentence1_length': 541, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 149.47, 'max_sentence2_length': 465, 'unique_sentence2': 1997}, 'kin_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 551507, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 152.12, 'max_sentence1_length': 541, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'kin_Latn-ewe_Latn': {'num_samples': 1997, 'number_of_characters': 542765, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 152.12, 'max_sentence1_length': 541, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 119.67, 'max_sentence2_length': 493, 'unique_sentence2': 1994}, 'kin_Latn-fuc_Latn': {'num_samples': 1997, 'number_of_characters': 532267, 
'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 152.12, 'max_sentence1_length': 541, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 114.41, 'max_sentence2_length': 376, 'unique_sentence2': 1996}, 'kin_Latn-nde_Latn': {'num_samples': 1997, 'number_of_characters': 601526, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 152.12, 'max_sentence1_length': 541, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 149.09, 'max_sentence2_length': 590, 'unique_sentence2': 1997}, 'kin_Latn-nya_Latn': {'num_samples': 1997, 'number_of_characters': 588069, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 152.12, 'max_sentence1_length': 541, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 142.35, 'max_sentence2_length': 464, 'unique_sentence2': 1993}, 'kin_Latn-sna_Latn': {'num_samples': 1997, 'number_of_characters': 602117, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 152.12, 'max_sentence1_length': 541, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 149.39, 'max_sentence2_length': 511, 'unique_sentence2': 1995}, 'kin_Latn-ven_Latn': {'num_samples': 1997, 'number_of_characters': 603543, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 152.12, 'max_sentence1_length': 541, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 150.1, 'max_sentence2_length': 535, 'unique_sentence2': 1993}, 'kir_Cyrl-aze_Latn': {'num_samples': 1997, 'number_of_characters': 520498, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 125.62, 'max_sentence1_length': 395, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 135.02, 'max_sentence2_length': 398, 'unique_sentence2': 1997}, 'kir_Cyrl-bak_Cyrl': {'num_samples': 1997, 
'number_of_characters': 497190, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 125.62, 'max_sentence1_length': 395, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 123.35, 'max_sentence2_length': 437, 'unique_sentence2': 1995}, 'kir_Cyrl-eng_Latn': {'num_samples': 1997, 'number_of_characters': 498584, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 125.62, 'max_sentence1_length': 395, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'kir_Cyrl-kaz_Cyrl': {'num_samples': 1997, 'number_of_characters': 511140, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 125.62, 'max_sentence1_length': 395, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 130.33, 'max_sentence2_length': 473, 'unique_sentence2': 1996}, 'kir_Cyrl-tat_Cyrl': {'num_samples': 1997, 'number_of_characters': 496790, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 125.62, 'max_sentence1_length': 395, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 123.15, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'kir_Cyrl-tuk_Latn': {'num_samples': 1997, 'number_of_characters': 536138, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 125.62, 'max_sentence1_length': 395, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 142.85, 'max_sentence2_length': 576, 'unique_sentence2': 1996}, 'kir_Cyrl-tur_Latn': {'num_samples': 1997, 'number_of_characters': 516477, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 125.62, 'max_sentence1_length': 395, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'kir_Cyrl-uig_Arab': {'num_samples': 
1997, 'number_of_characters': 561886, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 125.62, 'max_sentence1_length': 395, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 155.74, 'max_sentence2_length': 592, 'unique_sentence2': 1996}, 'kir_Cyrl-uzb_Latn': {'num_samples': 1997, 'number_of_characters': 544559, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 125.62, 'max_sentence1_length': 395, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 147.07, 'max_sentence2_length': 470, 'unique_sentence2': 1996}, 'kmr_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 477127, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 123.16, 'max_sentence1_length': 420, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'kmr_Latn-ckb_Arab': {'num_samples': 1997, 'number_of_characters': 498313, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 123.16, 'max_sentence1_length': 420, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 126.37, 'max_sentence2_length': 399, 'unique_sentence2': 1995}, 'kmr_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 493666, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 123.16, 'max_sentence1_length': 420, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'kmr_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 489285, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 123.16, 'max_sentence1_length': 420, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'kmr_Latn-heb_Hebr': 
{'num_samples': 1997, 'number_of_characters': 446242, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 123.16, 'max_sentence1_length': 420, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'kmr_Latn-mey_Arab': {'num_samples': 1997, 'number_of_characters': 459781, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 123.16, 'max_sentence1_length': 420, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 107.08, 'max_sentence2_length': 392, 'unique_sentence2': 1993}, 'kmr_Latn-prs_Arab': {'num_samples': 1997, 'number_of_characters': 488482, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 123.16, 'max_sentence1_length': 420, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 121.45, 'max_sentence2_length': 365, 'unique_sentence2': 1997}, 'kmr_Latn-pus_Arab': {'num_samples': 1997, 'number_of_characters': 488579, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 123.16, 'max_sentence1_length': 420, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 121.5, 'max_sentence2_length': 366, 'unique_sentence2': 1996}, 'kmr_Latn-shi_Arab': {'num_samples': 1997, 'number_of_characters': 460859, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 123.16, 'max_sentence1_length': 420, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 107.62, 'max_sentence2_length': 378, 'unique_sentence2': 1996}, 'kmr_Latn-tgk_Cyrl': {'num_samples': 1997, 'number_of_characters': 520093, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 123.16, 'max_sentence1_length': 420, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 137.28, 'max_sentence2_length': 451, 'unique_sentence2': 1995}, 
'kor_Hang-arb_Arab': {'num_samples': 1997, 'number_of_characters': 364586, 'unique_pairs': 1995, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'kor_Hang-ben_Beng': {'num_samples': 1997, 'number_of_characters': 377207, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'kor_Hang-deu_Latn': {'num_samples': 1997, 'number_of_characters': 429055, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'kor_Hang-ell_Grek': {'num_samples': 1997, 'number_of_characters': 432532, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'kor_Hang-eng_Latn': {'num_samples': 1997, 'number_of_characters': 381125, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'kor_Hang-fas_Arab': {'num_samples': 1997, 'number_of_characters': 376744, 'unique_pairs': 1995, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 
'kor_Hang-fin_Latn': {'num_samples': 1997, 'number_of_characters': 403205, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'kor_Hang-fra_Latn': {'num_samples': 1997, 'number_of_characters': 426513, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'kor_Hang-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 333701, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'kor_Hang-hin_Deva': {'num_samples': 1997, 'number_of_characters': 394980, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'kor_Hang-hun_Latn': {'num_samples': 1997, 'number_of_characters': 411781, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'kor_Hang-ind_Latn': {'num_samples': 1997, 'number_of_characters': 420377, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 
'kor_Hang-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 245031, 'unique_pairs': 1995, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'kor_Hang-lit_Latn': {'num_samples': 1997, 'number_of_characters': 392802, 'unique_pairs': 1995, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'kor_Hang-nld_Latn': {'num_samples': 1997, 'number_of_characters': 425320, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'kor_Hang-pol_Latn': {'num_samples': 1997, 'number_of_characters': 411271, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'kor_Hang-por_Latn': {'num_samples': 1997, 'number_of_characters': 410620, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'kor_Hang-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 408298, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 
'kor_Hang-spa_Latn': {'num_samples': 1997, 'number_of_characters': 421605, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'kor_Hang-swa_Latn': {'num_samples': 1997, 'number_of_characters': 405914, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'kor_Hang-swe_Latn': {'num_samples': 1997, 'number_of_characters': 385232, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'kor_Hang-tam_Taml': {'num_samples': 1997, 'number_of_characters': 443366, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'kor_Hang-tur_Latn': {'num_samples': 1997, 'number_of_characters': 399018, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'kor_Hang-vie_Latn': {'num_samples': 1997, 'number_of_characters': 404526, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 
'kor_Hang-yue_Hant': {'num_samples': 1997, 'number_of_characters': 212292, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 4, 'average_sentence2_length': 39.5, 'max_sentence2_length': 133, 'unique_sentence2': 1996}, 'kor_Hang-zho_Hans': {'num_samples': 1997, 'number_of_characters': 218366, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 4, 'average_sentence2_length': 42.54, 'max_sentence2_length': 263, 'unique_sentence2': 1997}, 'kor_Hang-zho_Hant': {'num_samples': 1997, 'number_of_characters': 224883, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'kor_Hang-zul_Latn': {'num_samples': 1997, 'number_of_characters': 391137, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'lao_Laoo-bod_Tibt': {'num_samples': 1997, 'number_of_characters': 567609, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 133.69, 'max_sentence1_length': 507, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 150.54, 'max_sentence2_length': 478, 'unique_sentence2': 1993}, 'lao_Laoo-dzo_Tibt': {'num_samples': 1997, 'number_of_characters': 510201, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 133.69, 'max_sentence1_length': 507, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 121.79, 'max_sentence2_length': 411, 'unique_sentence2': 1992}, 
'lao_Laoo-eng_Latn': {'num_samples': 1997, 'number_of_characters': 514700, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 133.69, 'max_sentence1_length': 507, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'lao_Laoo-khm_Khmr': {'num_samples': 1997, 'number_of_characters': 555471, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 133.69, 'max_sentence1_length': 507, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 144.46, 'max_sentence2_length': 517, 'unique_sentence2': 1996}, 'lao_Laoo-mon_Mong': {'num_samples': 1997, 'number_of_characters': 526028, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 133.69, 'max_sentence1_length': 507, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 129.72, 'max_sentence2_length': 414, 'unique_sentence2': 1997}, 'lao_Laoo-mya_Mymr': {'num_samples': 1997, 'number_of_characters': 578834, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 133.69, 'max_sentence1_length': 507, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 156.16, 'max_sentence2_length': 773, 'unique_sentence2': 1997}, 'lao_Laoo-tha_Thai': {'num_samples': 1997, 'number_of_characters': 504448, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 133.69, 'max_sentence1_length': 507, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 118.91, 'max_sentence2_length': 439, 'unique_sentence2': 1996}, 'lav_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 515908, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 134.3, 'max_sentence1_length': 503, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 
1997}, 'lav_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 537988, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 134.3, 'max_sentence1_length': 503, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'lav_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 546564, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 134.3, 'max_sentence1_length': 503, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'lav_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 527585, 'unique_pairs': 1995, 'min_sentence1_length': 7, 'average_sentence1_length': 134.3, 'max_sentence1_length': 503, 'unique_sentence1': 1994, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'lit_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 490578, 'unique_pairs': 1995, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'lit_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 503199, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'lit_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 555047, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 
1996}, 'lit_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 558524, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'lit_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 507117, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'lit_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 502736, 'unique_pairs': 1995, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'lit_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 529197, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'lit_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 552505, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'lit_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 459693, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 
'unique_sentence2': 1996}, 'lit_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 520972, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'lit_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 537773, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'lit_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 546369, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'lit_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 371023, 'unique_pairs': 1995, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'lit_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 392802, 'unique_pairs': 1995, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'lit_Latn-lav_Latn': {'num_samples': 1997, 'number_of_characters': 527585, 'unique_pairs': 1995, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 134.3, 'max_sentence2_length': 503, 
'unique_sentence2': 1994}, 'lit_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 551312, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'lit_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 537263, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'lit_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 536612, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'lit_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 534290, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'lit_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 547597, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'lit_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 531906, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 
430, 'unique_sentence2': 1997}, 'lit_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 511224, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'lit_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 569358, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'lit_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 525010, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'lit_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 530518, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'lit_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 350875, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'lit_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 517129, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 
'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'ltz_Latn-afr_Latn': {'num_samples': 1997, 'number_of_characters': 549109, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 140.59, 'max_sentence1_length': 543, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 134.38, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'ltz_Latn-dan_Latn': {'num_samples': 1997, 'number_of_characters': 532895, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 140.59, 'max_sentence1_length': 543, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 126.26, 'max_sentence2_length': 522, 'unique_sentence2': 1995}, 'ltz_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 576407, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 140.59, 'max_sentence1_length': 543, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'ltz_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 528477, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 140.59, 'max_sentence1_length': 543, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ltz_Latn-fao_Latn': {'num_samples': 1997, 'number_of_characters': 538560, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 140.59, 'max_sentence1_length': 543, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.1, 'max_sentence2_length': 433, 'unique_sentence2': 1997}, 'ltz_Latn-isl_Latn': {'num_samples': 1997, 'number_of_characters': 542965, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 140.59, 'max_sentence1_length': 543, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 
131.3, 'max_sentence2_length': 399, 'unique_sentence2': 1996}, 'ltz_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 572672, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 140.59, 'max_sentence1_length': 543, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'ltz_Latn-nno_Latn': {'num_samples': 1997, 'number_of_characters': 529114, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 140.59, 'max_sentence1_length': 543, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.37, 'max_sentence2_length': 417, 'unique_sentence2': 1996}, 'ltz_Latn-nob_Latn': {'num_samples': 1997, 'number_of_characters': 532201, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 140.59, 'max_sentence1_length': 543, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 125.91, 'max_sentence2_length': 482, 'unique_sentence2': 1996}, 'ltz_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 532584, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 140.59, 'max_sentence1_length': 543, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'mal_Mlym-eng_Latn': {'num_samples': 1997, 'number_of_characters': 551872, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 152.3, 'max_sentence1_length': 540, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'mal_Mlym-fij_Latn': {'num_samples': 1997, 'number_of_characters': 604657, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 152.3, 'max_sentence1_length': 540, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 
'average_sentence2_length': 150.48, 'max_sentence2_length': 448, 'unique_sentence2': 1988}, 'mal_Mlym-fil_Latn': {'num_samples': 1997, 'number_of_characters': 597572, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 152.3, 'max_sentence1_length': 540, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 146.93, 'max_sentence2_length': 554, 'unique_sentence2': 1997}, 'mal_Mlym-ind_Latn': {'num_samples': 1997, 'number_of_characters': 591124, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 152.3, 'max_sentence1_length': 540, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'mal_Mlym-mlg_Latn': {'num_samples': 1997, 'number_of_characters': 624460, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 152.3, 'max_sentence1_length': 540, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 160.39, 'max_sentence2_length': 559, 'unique_sentence2': 1994}, 'mal_Mlym-mri_Latn': {'num_samples': 1997, 'number_of_characters': 578276, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 152.3, 'max_sentence1_length': 540, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 137.27, 'max_sentence2_length': 443, 'unique_sentence2': 1997}, 'mal_Mlym-msa_Latn': {'num_samples': 1997, 'number_of_characters': 581335, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 152.3, 'max_sentence1_length': 540, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 138.8, 'max_sentence2_length': 463, 'unique_sentence2': 1997}, 'mal_Mlym-smo_Latn': {'num_samples': 1997, 'number_of_characters': 582007, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 152.3, 'max_sentence1_length': 540, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 
'average_sentence2_length': 139.14, 'max_sentence2_length': 431, 'unique_sentence2': 1996}, 'mal_Mlym-tah_Latn': {'num_samples': 1997, 'number_of_characters': 613775, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 152.3, 'max_sentence1_length': 540, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 155.04, 'max_sentence2_length': 524, 'unique_sentence2': 1997}, 'mal_Mlym-ton_Latn': {'num_samples': 1997, 'number_of_characters': 617792, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 152.3, 'max_sentence1_length': 540, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 157.06, 'max_sentence2_length': 468, 'unique_sentence2': 1997}, 'mar_Deva-ben_Beng': {'num_samples': 1997, 'number_of_characters': 504689, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'mar_Deva-div_Thaa': {'num_samples': 1997, 'number_of_characters': 564735, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'mar_Deva-eng_Latn': {'num_samples': 1997, 'number_of_characters': 508607, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'mar_Deva-eus_Latn': {'num_samples': 1997, 'number_of_characters': 536090, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 
6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'mar_Deva-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 505818, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'mar_Deva-hin_Deva': {'num_samples': 1997, 'number_of_characters': 522462, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'mar_Deva-kan_Knda': {'num_samples': 1997, 'number_of_characters': 526423, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'mar_Deva-nep_Deva': {'num_samples': 1997, 'number_of_characters': 509110, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'mar_Deva-pan_Guru': {'num_samples': 1997, 'number_of_characters': 511309, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'mar_Deva-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 519628, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 
'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'mar_Deva-snd_Arab': {'num_samples': 1997, 'number_of_characters': 481214, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'mar_Deva-tam_Taml': {'num_samples': 1997, 'number_of_characters': 570848, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'mar_Deva-tel_Telu': {'num_samples': 1997, 'number_of_characters': 508414, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'mar_Deva-urd_Arab': {'num_samples': 1997, 'number_of_characters': 508885, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'mey_Arab-arb_Arab': {'num_samples': 1997, 'number_of_characters': 445016, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 107.08, 'max_sentence1_length': 392, 'unique_sentence1': 1993, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'mey_Arab-ckb_Arab': {'num_samples': 1997, 'number_of_characters': 466202, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 107.08, 'max_sentence1_length': 392, 'unique_sentence1': 
1993, 'min_sentence2_length': 5, 'average_sentence2_length': 126.37, 'max_sentence2_length': 399, 'unique_sentence2': 1995}, 'mey_Arab-eng_Latn': {'num_samples': 1997, 'number_of_characters': 461555, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 107.08, 'max_sentence1_length': 392, 'unique_sentence1': 1993, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'mey_Arab-fas_Arab': {'num_samples': 1997, 'number_of_characters': 457174, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 107.08, 'max_sentence1_length': 392, 'unique_sentence1': 1993, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'mey_Arab-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 414131, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 107.08, 'max_sentence1_length': 392, 'unique_sentence1': 1993, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'mey_Arab-kmr_Latn': {'num_samples': 1997, 'number_of_characters': 459781, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 107.08, 'max_sentence1_length': 392, 'unique_sentence1': 1993, 'min_sentence2_length': 7, 'average_sentence2_length': 123.16, 'max_sentence2_length': 420, 'unique_sentence2': 1996}, 'mey_Arab-prs_Arab': {'num_samples': 1997, 'number_of_characters': 456371, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 107.08, 'max_sentence1_length': 392, 'unique_sentence1': 1993, 'min_sentence2_length': 8, 'average_sentence2_length': 121.45, 'max_sentence2_length': 365, 'unique_sentence2': 1997}, 'mey_Arab-pus_Arab': {'num_samples': 1997, 'number_of_characters': 456468, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 107.08, 'max_sentence1_length': 392, 
'unique_sentence1': 1993, 'min_sentence2_length': 8, 'average_sentence2_length': 121.5, 'max_sentence2_length': 366, 'unique_sentence2': 1996}, 'mey_Arab-shi_Arab': {'num_samples': 1997, 'number_of_characters': 428748, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 107.08, 'max_sentence1_length': 392, 'unique_sentence1': 1993, 'min_sentence2_length': 3, 'average_sentence2_length': 107.62, 'max_sentence2_length': 378, 'unique_sentence2': 1996}, 'mey_Arab-tgk_Cyrl': {'num_samples': 1997, 'number_of_characters': 487982, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 107.08, 'max_sentence1_length': 392, 'unique_sentence1': 1993, 'min_sentence2_length': 3, 'average_sentence2_length': 137.28, 'max_sentence2_length': 451, 'unique_sentence2': 1995}, 'mkd_Cyrl-bel_Cyrl': {'num_samples': 1997, 'number_of_characters': 523981, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.15, 'max_sentence1_length': 451, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 128.24, 'max_sentence2_length': 422, 'unique_sentence2': 1996}, 'mkd_Cyrl-bos_Latn': {'num_samples': 1997, 'number_of_characters': 522801, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.15, 'max_sentence1_length': 451, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 127.65, 'max_sentence2_length': 434, 'unique_sentence2': 1996}, 'mkd_Cyrl-bul_Cyrl': {'num_samples': 1997, 'number_of_characters': 537780, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.15, 'max_sentence1_length': 451, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 135.15, 'max_sentence2_length': 493, 'unique_sentence2': 1996}, 'mkd_Cyrl-ces_Latn': {'num_samples': 1997, 'number_of_characters': 509209, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.15, 'max_sentence1_length': 
451, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 120.84, 'max_sentence2_length': 474, 'unique_sentence2': 1997}, 'mkd_Cyrl-eng_Latn': {'num_samples': 1997, 'number_of_characters': 515611, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.15, 'max_sentence1_length': 451, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'mkd_Cyrl-hrv_Latn': {'num_samples': 1997, 'number_of_characters': 523816, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.15, 'max_sentence1_length': 451, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 128.15, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'mkd_Cyrl-pol_Latn': {'num_samples': 1997, 'number_of_characters': 545757, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.15, 'max_sentence1_length': 451, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'mkd_Cyrl-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 542784, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.15, 'max_sentence1_length': 451, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'mkd_Cyrl-slk_Latn': {'num_samples': 1997, 'number_of_characters': 520860, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.15, 'max_sentence1_length': 451, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 126.67, 'max_sentence2_length': 403, 'unique_sentence2': 1996}, 'mkd_Cyrl-slv_Latn': {'num_samples': 1997, 'number_of_characters': 520787, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.15, 
'max_sentence1_length': 451, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 126.64, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'mkd_Cyrl-srp_Cyrl': {'num_samples': 1997, 'number_of_characters': 520194, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.15, 'max_sentence1_length': 451, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 126.34, 'max_sentence2_length': 439, 'unique_sentence2': 1995}, 'mkd_Cyrl-srp_Latn': {'num_samples': 1997, 'number_of_characters': 524032, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.15, 'max_sentence1_length': 451, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 128.26, 'max_sentence2_length': 452, 'unique_sentence2': 1996}, 'mkd_Cyrl-ukr_Cyrl': {'num_samples': 1997, 'number_of_characters': 530674, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.15, 'max_sentence1_length': 451, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 131.59, 'max_sentence2_length': 440, 'unique_sentence2': 1996}, 'mlg_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 568028, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 160.39, 'max_sentence1_length': 559, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'mlg_Latn-fij_Latn': {'num_samples': 1997, 'number_of_characters': 620813, 'unique_pairs': 1995, 'min_sentence1_length': 6, 'average_sentence1_length': 160.39, 'max_sentence1_length': 559, 'unique_sentence1': 1994, 'min_sentence2_length': 10, 'average_sentence2_length': 150.48, 'max_sentence2_length': 448, 'unique_sentence2': 1988}, 'mlg_Latn-fil_Latn': {'num_samples': 1997, 'number_of_characters': 613728, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 
160.39, 'max_sentence1_length': 559, 'unique_sentence1': 1994, 'min_sentence2_length': 10, 'average_sentence2_length': 146.93, 'max_sentence2_length': 554, 'unique_sentence2': 1997}, 'mlg_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 607280, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 160.39, 'max_sentence1_length': 559, 'unique_sentence1': 1994, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'mlg_Latn-mal_Mlym': {'num_samples': 1997, 'number_of_characters': 624460, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 160.39, 'max_sentence1_length': 559, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 152.3, 'max_sentence2_length': 540, 'unique_sentence2': 1996}, 'mlg_Latn-mri_Latn': {'num_samples': 1997, 'number_of_characters': 594432, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 160.39, 'max_sentence1_length': 559, 'unique_sentence1': 1994, 'min_sentence2_length': 9, 'average_sentence2_length': 137.27, 'max_sentence2_length': 443, 'unique_sentence2': 1997}, 'mlg_Latn-msa_Latn': {'num_samples': 1997, 'number_of_characters': 597491, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 160.39, 'max_sentence1_length': 559, 'unique_sentence1': 1994, 'min_sentence2_length': 6, 'average_sentence2_length': 138.8, 'max_sentence2_length': 463, 'unique_sentence2': 1997}, 'mlg_Latn-smo_Latn': {'num_samples': 1997, 'number_of_characters': 598163, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 160.39, 'max_sentence1_length': 559, 'unique_sentence1': 1994, 'min_sentence2_length': 8, 'average_sentence2_length': 139.14, 'max_sentence2_length': 431, 'unique_sentence2': 1996}, 'mlg_Latn-tah_Latn': {'num_samples': 1997, 'number_of_characters': 629931, 'unique_pairs': 1997, 'min_sentence1_length': 6, 
'average_sentence1_length': 160.39, 'max_sentence1_length': 559, 'unique_sentence1': 1994, 'min_sentence2_length': 9, 'average_sentence2_length': 155.04, 'max_sentence2_length': 524, 'unique_sentence2': 1997}, 'mlg_Latn-ton_Latn': {'num_samples': 1997, 'number_of_characters': 633948, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 160.39, 'max_sentence1_length': 559, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 157.06, 'max_sentence2_length': 468, 'unique_sentence2': 1997}, 'mlt_Latn-cat_Latn': {'num_samples': 1997, 'number_of_characters': 560435, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 138.95, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 141.69, 'max_sentence2_length': 460, 'unique_sentence2': 1997}, 'mlt_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 525195, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 138.95, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'mlt_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 570583, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 138.95, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'mlt_Latn-glg_Latn': {'num_samples': 1997, 'number_of_characters': 549461, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 138.95, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 136.2, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'mlt_Latn-ita_Latn': {'num_samples': 1997, 'number_of_characters': 566692, 'unique_pairs': 1996, 'min_sentence1_length': 
8, 'average_sentence1_length': 138.95, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 144.83, 'max_sentence2_length': 623, 'unique_sentence2': 1996}, 'mlt_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 554690, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 138.95, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'mlt_Latn-ron_Latn': {'num_samples': 1997, 'number_of_characters': 569960, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 138.95, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 146.46, 'max_sentence2_length': 518, 'unique_sentence2': 1997}, 'mlt_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 565675, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 138.95, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'mon_Mong-bod_Tibt': {'num_samples': 1997, 'number_of_characters': 559677, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 129.72, 'max_sentence1_length': 414, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 150.54, 'max_sentence2_length': 478, 'unique_sentence2': 1993}, 'mon_Mong-dzo_Tibt': {'num_samples': 1997, 'number_of_characters': 502269, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 129.72, 'max_sentence1_length': 414, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 121.79, 'max_sentence2_length': 411, 'unique_sentence2': 1992}, 'mon_Mong-eng_Latn': {'num_samples': 1997, 'number_of_characters': 506768, 'unique_pairs': 1997, 
'min_sentence1_length': 11, 'average_sentence1_length': 129.72, 'max_sentence1_length': 414, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'mon_Mong-khm_Khmr': {'num_samples': 1997, 'number_of_characters': 547539, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 129.72, 'max_sentence1_length': 414, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 144.46, 'max_sentence2_length': 517, 'unique_sentence2': 1996}, 'mon_Mong-lao_Laoo': {'num_samples': 1997, 'number_of_characters': 526028, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 129.72, 'max_sentence1_length': 414, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 133.69, 'max_sentence2_length': 507, 'unique_sentence2': 1997}, 'mon_Mong-mya_Mymr': {'num_samples': 1997, 'number_of_characters': 570902, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 129.72, 'max_sentence1_length': 414, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 156.16, 'max_sentence2_length': 773, 'unique_sentence2': 1997}, 'mon_Mong-tha_Thai': {'num_samples': 1997, 'number_of_characters': 496516, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 129.72, 'max_sentence1_length': 414, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 118.91, 'max_sentence2_length': 439, 'unique_sentence2': 1996}, 'mri_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 521844, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 137.27, 'max_sentence1_length': 443, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'mri_Latn-fij_Latn': {'num_samples': 1997, 'number_of_characters': 574629, 
'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 137.27, 'max_sentence1_length': 443, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 150.48, 'max_sentence2_length': 448, 'unique_sentence2': 1988}, 'mri_Latn-fil_Latn': {'num_samples': 1997, 'number_of_characters': 567544, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 137.27, 'max_sentence1_length': 443, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 146.93, 'max_sentence2_length': 554, 'unique_sentence2': 1997}, 'mri_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 561096, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 137.27, 'max_sentence1_length': 443, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'mri_Latn-mal_Mlym': {'num_samples': 1997, 'number_of_characters': 578276, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 137.27, 'max_sentence1_length': 443, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 152.3, 'max_sentence2_length': 540, 'unique_sentence2': 1996}, 'mri_Latn-mlg_Latn': {'num_samples': 1997, 'number_of_characters': 594432, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 137.27, 'max_sentence1_length': 443, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 160.39, 'max_sentence2_length': 559, 'unique_sentence2': 1994}, 'mri_Latn-msa_Latn': {'num_samples': 1997, 'number_of_characters': 551307, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 137.27, 'max_sentence1_length': 443, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 138.8, 'max_sentence2_length': 463, 'unique_sentence2': 1997}, 'mri_Latn-smo_Latn': {'num_samples': 1997, 'number_of_characters': 
551979, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 137.27, 'max_sentence1_length': 443, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 139.14, 'max_sentence2_length': 431, 'unique_sentence2': 1996}, 'mri_Latn-tah_Latn': {'num_samples': 1997, 'number_of_characters': 583747, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 137.27, 'max_sentence1_length': 443, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 155.04, 'max_sentence2_length': 524, 'unique_sentence2': 1997}, 'mri_Latn-ton_Latn': {'num_samples': 1997, 'number_of_characters': 587764, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 137.27, 'max_sentence1_length': 443, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 157.06, 'max_sentence2_length': 468, 'unique_sentence2': 1997}, 'msa_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 524903, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 138.8, 'max_sentence1_length': 463, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'msa_Latn-fij_Latn': {'num_samples': 1997, 'number_of_characters': 577688, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 138.8, 'max_sentence1_length': 463, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 150.48, 'max_sentence2_length': 448, 'unique_sentence2': 1988}, 'msa_Latn-fil_Latn': {'num_samples': 1997, 'number_of_characters': 570603, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 138.8, 'max_sentence1_length': 463, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 146.93, 'max_sentence2_length': 554, 'unique_sentence2': 1997}, 'msa_Latn-ind_Latn': {'num_samples': 1997, 
'number_of_characters': 564155, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 138.8, 'max_sentence1_length': 463, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'msa_Latn-mal_Mlym': {'num_samples': 1997, 'number_of_characters': 581335, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 138.8, 'max_sentence1_length': 463, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 152.3, 'max_sentence2_length': 540, 'unique_sentence2': 1996}, 'msa_Latn-mlg_Latn': {'num_samples': 1997, 'number_of_characters': 597491, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 138.8, 'max_sentence1_length': 463, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 160.39, 'max_sentence2_length': 559, 'unique_sentence2': 1994}, 'msa_Latn-mri_Latn': {'num_samples': 1997, 'number_of_characters': 551307, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 138.8, 'max_sentence1_length': 463, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 137.27, 'max_sentence2_length': 443, 'unique_sentence2': 1997}, 'msa_Latn-smo_Latn': {'num_samples': 1997, 'number_of_characters': 555038, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 138.8, 'max_sentence1_length': 463, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 139.14, 'max_sentence2_length': 431, 'unique_sentence2': 1996}, 'msa_Latn-tah_Latn': {'num_samples': 1997, 'number_of_characters': 586806, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 138.8, 'max_sentence1_length': 463, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 155.04, 'max_sentence2_length': 524, 'unique_sentence2': 1997}, 'msa_Latn-ton_Latn': {'num_samples': 1997, 
'number_of_characters': 590823, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 138.8, 'max_sentence1_length': 463, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 157.06, 'max_sentence2_length': 468, 'unique_sentence2': 1997}, 'mya_Mymr-bod_Tibt': {'num_samples': 1997, 'number_of_characters': 612483, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 156.16, 'max_sentence1_length': 773, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 150.54, 'max_sentence2_length': 478, 'unique_sentence2': 1993}, 'mya_Mymr-dzo_Tibt': {'num_samples': 1997, 'number_of_characters': 555075, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 156.16, 'max_sentence1_length': 773, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 121.79, 'max_sentence2_length': 411, 'unique_sentence2': 1992}, 'mya_Mymr-eng_Latn': {'num_samples': 1997, 'number_of_characters': 559574, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 156.16, 'max_sentence1_length': 773, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'mya_Mymr-khm_Khmr': {'num_samples': 1997, 'number_of_characters': 600345, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 156.16, 'max_sentence1_length': 773, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 144.46, 'max_sentence2_length': 517, 'unique_sentence2': 1996}, 'mya_Mymr-lao_Laoo': {'num_samples': 1997, 'number_of_characters': 578834, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 156.16, 'max_sentence1_length': 773, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 133.69, 'max_sentence2_length': 507, 'unique_sentence2': 1997}, 'mya_Mymr-mon_Mong': {'num_samples': 
1997, 'number_of_characters': 570902, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 156.16, 'max_sentence1_length': 773, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 129.72, 'max_sentence2_length': 414, 'unique_sentence2': 1997}, 'mya_Mymr-tha_Thai': {'num_samples': 1997, 'number_of_characters': 549322, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 156.16, 'max_sentence1_length': 773, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 118.91, 'max_sentence2_length': 439, 'unique_sentence2': 1996}, 'nde_Latn-bem_Latn': {'num_samples': 1997, 'number_of_characters': 596231, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 149.09, 'max_sentence1_length': 590, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 149.47, 'max_sentence2_length': 465, 'unique_sentence2': 1997}, 'nde_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 545459, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 149.09, 'max_sentence1_length': 590, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'nde_Latn-ewe_Latn': {'num_samples': 1997, 'number_of_characters': 536717, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 149.09, 'max_sentence1_length': 590, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 119.67, 'max_sentence2_length': 493, 'unique_sentence2': 1994}, 'nde_Latn-fuc_Latn': {'num_samples': 1997, 'number_of_characters': 526219, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 149.09, 'max_sentence1_length': 590, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 114.41, 'max_sentence2_length': 376, 'unique_sentence2': 1996}, 'nde_Latn-kin_Latn': 
{'num_samples': 1997, 'number_of_characters': 601526, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 149.09, 'max_sentence1_length': 590, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 152.12, 'max_sentence2_length': 541, 'unique_sentence2': 1996}, 'nde_Latn-nya_Latn': {'num_samples': 1997, 'number_of_characters': 582021, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 149.09, 'max_sentence1_length': 590, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 142.35, 'max_sentence2_length': 464, 'unique_sentence2': 1993}, 'nde_Latn-sna_Latn': {'num_samples': 1997, 'number_of_characters': 596069, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 149.09, 'max_sentence1_length': 590, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 149.39, 'max_sentence2_length': 511, 'unique_sentence2': 1995}, 'nde_Latn-ven_Latn': {'num_samples': 1997, 'number_of_characters': 597495, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 149.09, 'max_sentence1_length': 590, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 150.1, 'max_sentence2_length': 535, 'unique_sentence2': 1993}, 'nep_Deva-ben_Beng': {'num_samples': 1997, 'number_of_characters': 492025, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'nep_Deva-div_Thaa': {'num_samples': 1997, 'number_of_characters': 552071, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 
'nep_Deva-eng_Latn': {'num_samples': 1997, 'number_of_characters': 495943, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'nep_Deva-eus_Latn': {'num_samples': 1997, 'number_of_characters': 523426, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'nep_Deva-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 493154, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'nep_Deva-hin_Deva': {'num_samples': 1997, 'number_of_characters': 509798, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'nep_Deva-kan_Knda': {'num_samples': 1997, 'number_of_characters': 513759, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'nep_Deva-mar_Deva': {'num_samples': 1997, 'number_of_characters': 509110, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 
'nep_Deva-pan_Guru': {'num_samples': 1997, 'number_of_characters': 498645, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'nep_Deva-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 506964, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'nep_Deva-snd_Arab': {'num_samples': 1997, 'number_of_characters': 468550, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'nep_Deva-tam_Taml': {'num_samples': 1997, 'number_of_characters': 558184, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'nep_Deva-tel_Telu': {'num_samples': 1997, 'number_of_characters': 495750, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'nep_Deva-urd_Arab': {'num_samples': 1997, 'number_of_characters': 496221, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 
'nld_Latn-afr_Latn': {'num_samples': 1997, 'number_of_characters': 560267, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 134.38, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'nld_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 523096, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'nld_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 535717, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'nld_Latn-dan_Latn': {'num_samples': 1997, 'number_of_characters': 544053, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 126.26, 'max_sentence2_length': 522, 'unique_sentence2': 1995}, 'nld_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 587565, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'nld_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 591042, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 
1996}, 'nld_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 539635, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'nld_Latn-fao_Latn': {'num_samples': 1997, 'number_of_characters': 549718, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.1, 'max_sentence2_length': 433, 'unique_sentence2': 1997}, 'nld_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 535254, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'nld_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 561715, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'nld_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 585023, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'nld_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 492211, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 
'unique_sentence2': 1996}, 'nld_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 553490, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'nld_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 570291, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'nld_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 578887, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'nld_Latn-isl_Latn': {'num_samples': 1997, 'number_of_characters': 554123, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 131.3, 'max_sentence2_length': 399, 'unique_sentence2': 1996}, 'nld_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 403541, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'nld_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 425320, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 
'unique_sentence2': 1995}, 'nld_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 551312, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'nld_Latn-ltz_Latn': {'num_samples': 1997, 'number_of_characters': 572672, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 140.59, 'max_sentence2_length': 543, 'unique_sentence2': 1996}, 'nld_Latn-nno_Latn': {'num_samples': 1997, 'number_of_characters': 540272, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.37, 'max_sentence2_length': 417, 'unique_sentence2': 1996}, 'nld_Latn-nob_Latn': {'num_samples': 1997, 'number_of_characters': 543359, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 125.91, 'max_sentence2_length': 482, 'unique_sentence2': 1996}, 'nld_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 569781, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'nld_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 569130, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 
497, 'unique_sentence2': 1996}, 'nld_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 566808, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'nld_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 580115, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'nld_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 564424, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'nld_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 543742, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'nld_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 601876, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'nld_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 557528, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 
'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'nld_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 563036, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'nld_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 383393, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'nld_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 549647, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'nno_Latn-afr_Latn': {'num_samples': 1997, 'number_of_characters': 516709, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.37, 'max_sentence1_length': 417, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 134.38, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'nno_Latn-dan_Latn': {'num_samples': 1997, 'number_of_characters': 500495, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.37, 'max_sentence1_length': 417, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 126.26, 'max_sentence2_length': 522, 'unique_sentence2': 1995}, 'nno_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 544007, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.37, 'max_sentence1_length': 417, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 
148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'nno_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 496077, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.37, 'max_sentence1_length': 417, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'nno_Latn-fao_Latn': {'num_samples': 1997, 'number_of_characters': 506160, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.37, 'max_sentence1_length': 417, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.1, 'max_sentence2_length': 433, 'unique_sentence2': 1997}, 'nno_Latn-isl_Latn': {'num_samples': 1997, 'number_of_characters': 510565, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.37, 'max_sentence1_length': 417, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 131.3, 'max_sentence2_length': 399, 'unique_sentence2': 1996}, 'nno_Latn-ltz_Latn': {'num_samples': 1997, 'number_of_characters': 529114, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.37, 'max_sentence1_length': 417, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 140.59, 'max_sentence2_length': 543, 'unique_sentence2': 1996}, 'nno_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 540272, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.37, 'max_sentence1_length': 417, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'nno_Latn-nob_Latn': {'num_samples': 1997, 'number_of_characters': 499801, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.37, 'max_sentence1_length': 417, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 
'average_sentence2_length': 125.91, 'max_sentence2_length': 482, 'unique_sentence2': 1996}, 'nno_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 500184, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.37, 'max_sentence1_length': 417, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'nob_Latn-afr_Latn': {'num_samples': 1997, 'number_of_characters': 519796, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 125.91, 'max_sentence1_length': 482, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 134.38, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'nob_Latn-dan_Latn': {'num_samples': 1997, 'number_of_characters': 503582, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 125.91, 'max_sentence1_length': 482, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 126.26, 'max_sentence2_length': 522, 'unique_sentence2': 1995}, 'nob_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 547094, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 125.91, 'max_sentence1_length': 482, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'nob_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 499164, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 125.91, 'max_sentence1_length': 482, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'nob_Latn-fao_Latn': {'num_samples': 1997, 'number_of_characters': 509247, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 125.91, 'max_sentence1_length': 482, 'unique_sentence1': 1996, 'min_sentence2_length': 
8, 'average_sentence2_length': 129.1, 'max_sentence2_length': 433, 'unique_sentence2': 1997}, 'nob_Latn-isl_Latn': {'num_samples': 1997, 'number_of_characters': 513652, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 125.91, 'max_sentence1_length': 482, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 131.3, 'max_sentence2_length': 399, 'unique_sentence2': 1996}, 'nob_Latn-ltz_Latn': {'num_samples': 1997, 'number_of_characters': 532201, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 125.91, 'max_sentence1_length': 482, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 140.59, 'max_sentence2_length': 543, 'unique_sentence2': 1996}, 'nob_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 543359, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 125.91, 'max_sentence1_length': 482, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'nob_Latn-nno_Latn': {'num_samples': 1997, 'number_of_characters': 499801, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 125.91, 'max_sentence1_length': 482, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.37, 'max_sentence2_length': 417, 'unique_sentence2': 1996}, 'nob_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 503271, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 125.91, 'max_sentence1_length': 482, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'nso_Latn-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 459006, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 1996, 
'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'nso_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 539219, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'nso_Latn-hau_Latn': {'num_samples': 1997, 'number_of_characters': 561465, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'nso_Latn-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 537600, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 'unique_sentence2': 1997}, 'nso_Latn-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 528930, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'nso_Latn-som_Latn': {'num_samples': 1997, 'number_of_characters': 582791, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'nso_Latn-ssw_Latn': {'num_samples': 1997, 'number_of_characters': 579641, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 
1996, 'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'nso_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 564008, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'nso_Latn-tir_Ethi': {'num_samples': 1997, 'number_of_characters': 456737, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 272, 'unique_sentence2': 1996}, 'nso_Latn-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 625782, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 'unique_sentence2': 1997}, 'nso_Latn-wol_Latn': {'num_samples': 1997, 'number_of_characters': 531302, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 'nso_Latn-xho_Latn': {'num_samples': 1997, 'number_of_characters': 559589, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'nso_Latn-yor_Latn': {'num_samples': 1997, 'number_of_characters': 607587, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 
'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'nso_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 549231, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'nya_Latn-bem_Latn': {'num_samples': 1997, 'number_of_characters': 582774, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 142.35, 'max_sentence1_length': 464, 'unique_sentence1': 1993, 'min_sentence2_length': 8, 'average_sentence2_length': 149.47, 'max_sentence2_length': 465, 'unique_sentence2': 1997}, 'nya_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 532002, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 142.35, 'max_sentence1_length': 464, 'unique_sentence1': 1993, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'nya_Latn-ewe_Latn': {'num_samples': 1997, 'number_of_characters': 523260, 'unique_pairs': 1995, 'min_sentence1_length': 10, 'average_sentence1_length': 142.35, 'max_sentence1_length': 464, 'unique_sentence1': 1993, 'min_sentence2_length': 6, 'average_sentence2_length': 119.67, 'max_sentence2_length': 493, 'unique_sentence2': 1994}, 'nya_Latn-fuc_Latn': {'num_samples': 1997, 'number_of_characters': 512762, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 142.35, 'max_sentence1_length': 464, 'unique_sentence1': 1993, 'min_sentence2_length': 6, 'average_sentence2_length': 114.41, 'max_sentence2_length': 376, 'unique_sentence2': 1996}, 'nya_Latn-kin_Latn': {'num_samples': 1997, 'number_of_characters': 588069, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 142.35, 
'max_sentence1_length': 464, 'unique_sentence1': 1993, 'min_sentence2_length': 10, 'average_sentence2_length': 152.12, 'max_sentence2_length': 541, 'unique_sentence2': 1996}, 'nya_Latn-nde_Latn': {'num_samples': 1997, 'number_of_characters': 582021, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 142.35, 'max_sentence1_length': 464, 'unique_sentence1': 1993, 'min_sentence2_length': 6, 'average_sentence2_length': 149.09, 'max_sentence2_length': 590, 'unique_sentence2': 1997}, 'nya_Latn-sna_Latn': {'num_samples': 1997, 'number_of_characters': 582612, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 142.35, 'max_sentence1_length': 464, 'unique_sentence1': 1993, 'min_sentence2_length': 6, 'average_sentence2_length': 149.39, 'max_sentence2_length': 511, 'unique_sentence2': 1995}, 'nya_Latn-ven_Latn': {'num_samples': 1997, 'number_of_characters': 584038, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 142.35, 'max_sentence1_length': 464, 'unique_sentence1': 1993, 'min_sentence2_length': 10, 'average_sentence2_length': 150.1, 'max_sentence2_length': 535, 'unique_sentence2': 1993}, 'orm_Ethi-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 404938, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'orm_Ethi-eng_Latn': {'num_samples': 1997, 'number_of_characters': 485151, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'orm_Ethi-hau_Latn': {'num_samples': 1997, 'number_of_characters': 507397, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 
118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'orm_Ethi-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 483532, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 'unique_sentence2': 1997}, 'orm_Ethi-nso_Latn': {'num_samples': 1997, 'number_of_characters': 528930, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 1996}, 'orm_Ethi-som_Latn': {'num_samples': 1997, 'number_of_characters': 528723, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'orm_Ethi-ssw_Latn': {'num_samples': 1997, 'number_of_characters': 525573, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'orm_Ethi-swa_Latn': {'num_samples': 1997, 'number_of_characters': 509940, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'orm_Ethi-tir_Ethi': {'num_samples': 1997, 'number_of_characters': 402669, 'unique_pairs': 1997, 'min_sentence1_length': 9, 
'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 272, 'unique_sentence2': 1996}, 'orm_Ethi-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 571714, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 'unique_sentence2': 1997}, 'orm_Ethi-wol_Latn': {'num_samples': 1997, 'number_of_characters': 477234, 'unique_pairs': 1992, 'min_sentence1_length': 9, 'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 'orm_Ethi-xho_Latn': {'num_samples': 1997, 'number_of_characters': 505521, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'orm_Ethi-yor_Latn': {'num_samples': 1997, 'number_of_characters': 553519, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'orm_Ethi-zul_Latn': {'num_samples': 1997, 'number_of_characters': 495163, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'pan_Guru-ben_Beng': {'num_samples': 1997, 'number_of_characters': 494224, 'unique_pairs': 1997, 'min_sentence1_length': 
7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 383, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'pan_Guru-div_Thaa': {'num_samples': 1997, 'number_of_characters': 554270, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 383, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'pan_Guru-eng_Latn': {'num_samples': 1997, 'number_of_characters': 498142, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 383, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'pan_Guru-eus_Latn': {'num_samples': 1997, 'number_of_characters': 525625, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 383, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'pan_Guru-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 495353, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 383, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'pan_Guru-hin_Deva': {'num_samples': 1997, 'number_of_characters': 511997, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 383, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'pan_Guru-kan_Knda': {'num_samples': 1997, 'number_of_characters': 515958, 'unique_pairs': 1996, 'min_sentence1_length': 
7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 383, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'pan_Guru-mar_Deva': {'num_samples': 1997, 'number_of_characters': 511309, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 383, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'pan_Guru-nep_Deva': {'num_samples': 1997, 'number_of_characters': 498645, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 383, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'pan_Guru-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 509163, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 383, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'pan_Guru-snd_Arab': {'num_samples': 1997, 'number_of_characters': 470749, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 383, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'pan_Guru-tam_Taml': {'num_samples': 1997, 'number_of_characters': 560383, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 383, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'pan_Guru-tel_Telu': {'num_samples': 1997, 'number_of_characters': 497949, 'unique_pairs': 1996, 'min_sentence1_length': 
7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 383, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'pan_Guru-urd_Arab': {'num_samples': 1997, 'number_of_characters': 498420, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 383, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'pol_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 509047, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'pol_Latn-bel_Cyrl': {'num_samples': 1997, 'number_of_characters': 533956, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.24, 'max_sentence2_length': 422, 'unique_sentence2': 1996}, 'pol_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 521668, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'pol_Latn-bos_Latn': {'num_samples': 1997, 'number_of_characters': 532776, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 127.65, 'max_sentence2_length': 434, 'unique_sentence2': 1996}, 'pol_Latn-bul_Cyrl': {'num_samples': 1997, 'number_of_characters': 547755, 'unique_pairs': 1996, 
'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 135.15, 'max_sentence2_length': 493, 'unique_sentence2': 1996}, 'pol_Latn-ces_Latn': {'num_samples': 1997, 'number_of_characters': 519184, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 120.84, 'max_sentence2_length': 474, 'unique_sentence2': 1997}, 'pol_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 573516, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'pol_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 576993, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'pol_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 525586, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'pol_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 521205, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'pol_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 547666, 'unique_pairs': 
1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'pol_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 570974, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'pol_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 478162, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'pol_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 539441, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'pol_Latn-hrv_Latn': {'num_samples': 1997, 'number_of_characters': 533791, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.15, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'pol_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 556242, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'pol_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 564838, 
'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'pol_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 389492, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'pol_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 411271, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'pol_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 537263, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'pol_Latn-mkd_Cyrl': {'num_samples': 1997, 'number_of_characters': 545757, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.15, 'max_sentence2_length': 451, 'unique_sentence2': 1997}, 'pol_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 569781, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'pol_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 555081, 
'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'pol_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 552759, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'pol_Latn-slk_Latn': {'num_samples': 1997, 'number_of_characters': 530835, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 126.67, 'max_sentence2_length': 403, 'unique_sentence2': 1996}, 'pol_Latn-slv_Latn': {'num_samples': 1997, 'number_of_characters': 530762, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.64, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'pol_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 566066, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'pol_Latn-srp_Cyrl': {'num_samples': 1997, 'number_of_characters': 530169, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.34, 'max_sentence2_length': 439, 'unique_sentence2': 1995}, 'pol_Latn-srp_Latn': {'num_samples': 1997, 'number_of_characters': 
534007, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 128.26, 'max_sentence2_length': 452, 'unique_sentence2': 1996}, 'pol_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 550375, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'pol_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 529693, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'pol_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 587827, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'pol_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 543479, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'pol_Latn-ukr_Cyrl': {'num_samples': 1997, 'number_of_characters': 540649, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 131.59, 'max_sentence2_length': 440, 'unique_sentence2': 1996}, 'pol_Latn-vie_Latn': {'num_samples': 1997, 
'number_of_characters': 548987, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'pol_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 369344, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'pol_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 535598, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'por_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 508396, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'por_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 521017, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'por_Latn-cat_Latn': {'num_samples': 1997, 'number_of_characters': 560175, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 141.69, 'max_sentence2_length': 460, 'unique_sentence2': 1997}, 'por_Latn-deu_Latn': {'num_samples': 
1997, 'number_of_characters': 572865, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'por_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 576342, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'por_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 524935, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'por_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 520554, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'por_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 547015, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'por_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 570323, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'por_Latn-glg_Latn': 
{'num_samples': 1997, 'number_of_characters': 549201, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 136.2, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'por_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 477511, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'por_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 538790, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'por_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 555591, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'por_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 564187, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'por_Latn-ita_Latn': {'num_samples': 1997, 'number_of_characters': 566432, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 144.83, 'max_sentence2_length': 623, 'unique_sentence2': 1996}, 'por_Latn-jpn_Jpan': 
{'num_samples': 1997, 'number_of_characters': 388841, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'por_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 410620, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'por_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 536612, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'por_Latn-mlt_Latn': {'num_samples': 1997, 'number_of_characters': 554690, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 138.95, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'por_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 569130, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'por_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 555081, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'por_Latn-ron_Latn': 
{'num_samples': 1997, 'number_of_characters': 569700, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 146.46, 'max_sentence2_length': 518, 'unique_sentence2': 1997}, 'por_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 552108, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'por_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 565415, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'por_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 549724, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'por_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 529042, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'por_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 587176, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 
'por_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 542828, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'por_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 548336, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'por_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 368693, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'por_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 534947, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'prs_Arab-arb_Arab': {'num_samples': 1997, 'number_of_characters': 473717, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.45, 'max_sentence1_length': 365, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'prs_Arab-ckb_Arab': {'num_samples': 1997, 'number_of_characters': 494903, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.45, 'max_sentence1_length': 365, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 126.37, 'max_sentence2_length': 399, 'unique_sentence2': 
1995}, 'prs_Arab-eng_Latn': {'num_samples': 1997, 'number_of_characters': 490256, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.45, 'max_sentence1_length': 365, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'prs_Arab-fas_Arab': {'num_samples': 1997, 'number_of_characters': 485875, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.45, 'max_sentence1_length': 365, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'prs_Arab-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 442832, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.45, 'max_sentence1_length': 365, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'prs_Arab-kmr_Latn': {'num_samples': 1997, 'number_of_characters': 488482, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.45, 'max_sentence1_length': 365, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 123.16, 'max_sentence2_length': 420, 'unique_sentence2': 1996}, 'prs_Arab-mey_Arab': {'num_samples': 1997, 'number_of_characters': 456371, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.45, 'max_sentence1_length': 365, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 107.08, 'max_sentence2_length': 392, 'unique_sentence2': 1993}, 'prs_Arab-pus_Arab': {'num_samples': 1997, 'number_of_characters': 485169, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.45, 'max_sentence1_length': 365, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 121.5, 'max_sentence2_length': 366, 
'unique_sentence2': 1996}, 'prs_Arab-shi_Arab': {'num_samples': 1997, 'number_of_characters': 457449, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.45, 'max_sentence1_length': 365, 'unique_sentence1': 1997, 'min_sentence2_length': 3, 'average_sentence2_length': 107.62, 'max_sentence2_length': 378, 'unique_sentence2': 1996}, 'prs_Arab-tgk_Cyrl': {'num_samples': 1997, 'number_of_characters': 516683, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.45, 'max_sentence1_length': 365, 'unique_sentence1': 1997, 'min_sentence2_length': 3, 'average_sentence2_length': 137.28, 'max_sentence2_length': 451, 'unique_sentence2': 1995}, 'pus_Arab-arb_Arab': {'num_samples': 1997, 'number_of_characters': 473814, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 121.5, 'max_sentence1_length': 366, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'pus_Arab-ckb_Arab': {'num_samples': 1997, 'number_of_characters': 495000, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 121.5, 'max_sentence1_length': 366, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 126.37, 'max_sentence2_length': 399, 'unique_sentence2': 1995}, 'pus_Arab-eng_Latn': {'num_samples': 1997, 'number_of_characters': 490353, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.5, 'max_sentence1_length': 366, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'pus_Arab-fas_Arab': {'num_samples': 1997, 'number_of_characters': 485972, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 121.5, 'max_sentence1_length': 366, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 
'unique_sentence2': 1995}, 'pus_Arab-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 442929, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 121.5, 'max_sentence1_length': 366, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'pus_Arab-kmr_Latn': {'num_samples': 1997, 'number_of_characters': 488579, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 121.5, 'max_sentence1_length': 366, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 123.16, 'max_sentence2_length': 420, 'unique_sentence2': 1996}, 'pus_Arab-mey_Arab': {'num_samples': 1997, 'number_of_characters': 456468, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.5, 'max_sentence1_length': 366, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 107.08, 'max_sentence2_length': 392, 'unique_sentence2': 1993}, 'pus_Arab-prs_Arab': {'num_samples': 1997, 'number_of_characters': 485169, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.5, 'max_sentence1_length': 366, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 121.45, 'max_sentence2_length': 365, 'unique_sentence2': 1997}, 'pus_Arab-shi_Arab': {'num_samples': 1997, 'number_of_characters': 457546, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.5, 'max_sentence1_length': 366, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 107.62, 'max_sentence2_length': 378, 'unique_sentence2': 1996}, 'pus_Arab-tgk_Cyrl': {'num_samples': 1997, 'number_of_characters': 516780, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 121.5, 'max_sentence1_length': 366, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 137.28, 'max_sentence2_length': 451, 
'unique_sentence2': 1995}, 'ron_Latn-cat_Latn': {'num_samples': 1997, 'number_of_characters': 575445, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 146.46, 'max_sentence1_length': 518, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 141.69, 'max_sentence2_length': 460, 'unique_sentence2': 1997}, 'ron_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 540205, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 146.46, 'max_sentence1_length': 518, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ron_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 585593, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 146.46, 'max_sentence1_length': 518, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'ron_Latn-glg_Latn': {'num_samples': 1997, 'number_of_characters': 564471, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 146.46, 'max_sentence1_length': 518, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 136.2, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'ron_Latn-ita_Latn': {'num_samples': 1997, 'number_of_characters': 581702, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 146.46, 'max_sentence1_length': 518, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 144.83, 'max_sentence2_length': 623, 'unique_sentence2': 1996}, 'ron_Latn-mlt_Latn': {'num_samples': 1997, 'number_of_characters': 569960, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 146.46, 'max_sentence1_length': 518, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 138.95, 'max_sentence2_length': 
582, 'unique_sentence2': 1996}, 'ron_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 569700, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 146.46, 'max_sentence1_length': 518, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'ron_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 580685, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 146.46, 'max_sentence1_length': 518, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'rus_Cyrl-arb_Arab': {'num_samples': 1997, 'number_of_characters': 506074, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'rus_Cyrl-bel_Cyrl': {'num_samples': 1997, 'number_of_characters': 530983, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.24, 'max_sentence2_length': 422, 'unique_sentence2': 1996}, 'rus_Cyrl-ben_Beng': {'num_samples': 1997, 'number_of_characters': 518695, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'rus_Cyrl-bos_Latn': {'num_samples': 1997, 'number_of_characters': 529803, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 127.65, 
'max_sentence2_length': 434, 'unique_sentence2': 1996}, 'rus_Cyrl-bul_Cyrl': {'num_samples': 1997, 'number_of_characters': 544782, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 135.15, 'max_sentence2_length': 493, 'unique_sentence2': 1996}, 'rus_Cyrl-ces_Latn': {'num_samples': 1997, 'number_of_characters': 516211, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 120.84, 'max_sentence2_length': 474, 'unique_sentence2': 1997}, 'rus_Cyrl-deu_Latn': {'num_samples': 1997, 'number_of_characters': 570543, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'rus_Cyrl-ell_Grek': {'num_samples': 1997, 'number_of_characters': 574020, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'rus_Cyrl-eng_Latn': {'num_samples': 1997, 'number_of_characters': 522613, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'rus_Cyrl-fas_Arab': {'num_samples': 1997, 'number_of_characters': 518232, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 
121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'rus_Cyrl-fin_Latn': {'num_samples': 1997, 'number_of_characters': 544693, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'rus_Cyrl-fra_Latn': {'num_samples': 1997, 'number_of_characters': 568001, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'rus_Cyrl-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 475189, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'rus_Cyrl-hin_Deva': {'num_samples': 1997, 'number_of_characters': 536468, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'rus_Cyrl-hrv_Latn': {'num_samples': 1997, 'number_of_characters': 530818, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.15, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'rus_Cyrl-hun_Latn': {'num_samples': 1997, 'number_of_characters': 553269, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 
'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'rus_Cyrl-ind_Latn': {'num_samples': 1997, 'number_of_characters': 561865, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'rus_Cyrl-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 386519, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'rus_Cyrl-kor_Hang': {'num_samples': 1997, 'number_of_characters': 408298, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'rus_Cyrl-lit_Latn': {'num_samples': 1997, 'number_of_characters': 534290, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'rus_Cyrl-mkd_Cyrl': {'num_samples': 1997, 'number_of_characters': 542784, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.15, 'max_sentence2_length': 451, 'unique_sentence2': 1997}, 'rus_Cyrl-nld_Latn': {'num_samples': 1997, 'number_of_characters': 566808, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 
'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'rus_Cyrl-pol_Latn': {'num_samples': 1997, 'number_of_characters': 552759, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'rus_Cyrl-por_Latn': {'num_samples': 1997, 'number_of_characters': 552108, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'rus_Cyrl-slk_Latn': {'num_samples': 1997, 'number_of_characters': 527862, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 126.67, 'max_sentence2_length': 403, 'unique_sentence2': 1996}, 'rus_Cyrl-slv_Latn': {'num_samples': 1997, 'number_of_characters': 527789, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.64, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'rus_Cyrl-spa_Latn': {'num_samples': 1997, 'number_of_characters': 563093, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'rus_Cyrl-srp_Cyrl': {'num_samples': 1997, 'number_of_characters': 527196, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 
'min_sentence2_length': 6, 'average_sentence2_length': 126.34, 'max_sentence2_length': 439, 'unique_sentence2': 1995}, 'rus_Cyrl-srp_Latn': {'num_samples': 1997, 'number_of_characters': 531034, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 128.26, 'max_sentence2_length': 452, 'unique_sentence2': 1996}, 'rus_Cyrl-swa_Latn': {'num_samples': 1997, 'number_of_characters': 547402, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'rus_Cyrl-swe_Latn': {'num_samples': 1997, 'number_of_characters': 526720, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'rus_Cyrl-tam_Taml': {'num_samples': 1997, 'number_of_characters': 584854, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'rus_Cyrl-tur_Latn': {'num_samples': 1997, 'number_of_characters': 540506, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'rus_Cyrl-ukr_Cyrl': {'num_samples': 1997, 'number_of_characters': 537676, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 
1996, 'min_sentence2_length': 6, 'average_sentence2_length': 131.59, 'max_sentence2_length': 440, 'unique_sentence2': 1996}, 'rus_Cyrl-vie_Latn': {'num_samples': 1997, 'number_of_characters': 546014, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'rus_Cyrl-zho_Hant': {'num_samples': 1997, 'number_of_characters': 366371, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'rus_Cyrl-zul_Latn': {'num_samples': 1997, 'number_of_characters': 532625, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'shi_Arab-arb_Arab': {'num_samples': 1997, 'number_of_characters': 446094, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 107.62, 'max_sentence1_length': 378, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'shi_Arab-ckb_Arab': {'num_samples': 1997, 'number_of_characters': 467280, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 107.62, 'max_sentence1_length': 378, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 126.37, 'max_sentence2_length': 399, 'unique_sentence2': 1995}, 'shi_Arab-eng_Latn': {'num_samples': 1997, 'number_of_characters': 462633, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 107.62, 'max_sentence1_length': 378, 
'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'shi_Arab-fas_Arab': {'num_samples': 1997, 'number_of_characters': 458252, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 107.62, 'max_sentence1_length': 378, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'shi_Arab-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 415209, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 107.62, 'max_sentence1_length': 378, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'shi_Arab-kmr_Latn': {'num_samples': 1997, 'number_of_characters': 460859, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 107.62, 'max_sentence1_length': 378, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 123.16, 'max_sentence2_length': 420, 'unique_sentence2': 1996}, 'shi_Arab-mey_Arab': {'num_samples': 1997, 'number_of_characters': 428748, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 107.62, 'max_sentence1_length': 378, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 107.08, 'max_sentence2_length': 392, 'unique_sentence2': 1993}, 'shi_Arab-prs_Arab': {'num_samples': 1997, 'number_of_characters': 457449, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 107.62, 'max_sentence1_length': 378, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 121.45, 'max_sentence2_length': 365, 'unique_sentence2': 1997}, 'shi_Arab-pus_Arab': {'num_samples': 1997, 'number_of_characters': 457546, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 107.62, 'max_sentence1_length': 
378, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 121.5, 'max_sentence2_length': 366, 'unique_sentence2': 1996}, 'shi_Arab-tgk_Cyrl': {'num_samples': 1997, 'number_of_characters': 489060, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 107.62, 'max_sentence1_length': 378, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 137.28, 'max_sentence2_length': 451, 'unique_sentence2': 1995}, 'sin_Sinh-ben_Beng': {'num_samples': 1997, 'number_of_characters': 502543, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'sin_Sinh-div_Thaa': {'num_samples': 1997, 'number_of_characters': 562589, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'sin_Sinh-eng_Latn': {'num_samples': 1997, 'number_of_characters': 506461, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'sin_Sinh-eus_Latn': {'num_samples': 1997, 'number_of_characters': 533944, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'sin_Sinh-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 503672, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 
'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'sin_Sinh-hin_Deva': {'num_samples': 1997, 'number_of_characters': 520316, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'sin_Sinh-kan_Knda': {'num_samples': 1997, 'number_of_characters': 524277, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'sin_Sinh-mar_Deva': {'num_samples': 1997, 'number_of_characters': 519628, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'sin_Sinh-nep_Deva': {'num_samples': 1997, 'number_of_characters': 506964, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'sin_Sinh-pan_Guru': {'num_samples': 1997, 'number_of_characters': 509163, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'sin_Sinh-snd_Arab': {'num_samples': 1997, 'number_of_characters': 479068, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 
129.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'sin_Sinh-tam_Taml': {'num_samples': 1997, 'number_of_characters': 568702, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'sin_Sinh-tel_Telu': {'num_samples': 1997, 'number_of_characters': 506268, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'sin_Sinh-urd_Arab': {'num_samples': 1997, 'number_of_characters': 506739, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'slk_Latn-bel_Cyrl': {'num_samples': 1997, 'number_of_characters': 509059, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 126.67, 'max_sentence1_length': 403, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.24, 'max_sentence2_length': 422, 'unique_sentence2': 1996}, 'slk_Latn-bos_Latn': {'num_samples': 1997, 'number_of_characters': 507879, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 126.67, 'max_sentence1_length': 403, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 127.65, 'max_sentence2_length': 434, 'unique_sentence2': 1996}, 'slk_Latn-bul_Cyrl': {'num_samples': 1997, 'number_of_characters': 522858, 'unique_pairs': 1996, 'min_sentence1_length': 9, 
'average_sentence1_length': 126.67, 'max_sentence1_length': 403, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 135.15, 'max_sentence2_length': 493, 'unique_sentence2': 1996}, 'slk_Latn-ces_Latn': {'num_samples': 1997, 'number_of_characters': 494287, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 126.67, 'max_sentence1_length': 403, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 120.84, 'max_sentence2_length': 474, 'unique_sentence2': 1997}, 'slk_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 500689, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 126.67, 'max_sentence1_length': 403, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'slk_Latn-hrv_Latn': {'num_samples': 1997, 'number_of_characters': 508894, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 126.67, 'max_sentence1_length': 403, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.15, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'slk_Latn-mkd_Cyrl': {'num_samples': 1997, 'number_of_characters': 520860, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 126.67, 'max_sentence1_length': 403, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.15, 'max_sentence2_length': 451, 'unique_sentence2': 1997}, 'slk_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 530835, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 126.67, 'max_sentence1_length': 403, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'slk_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 527862, 'unique_pairs': 1996, 
'min_sentence1_length': 9, 'average_sentence1_length': 126.67, 'max_sentence1_length': 403, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'slk_Latn-slv_Latn': {'num_samples': 1997, 'number_of_characters': 505865, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 126.67, 'max_sentence1_length': 403, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.64, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'slk_Latn-srp_Cyrl': {'num_samples': 1997, 'number_of_characters': 505272, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 126.67, 'max_sentence1_length': 403, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.34, 'max_sentence2_length': 439, 'unique_sentence2': 1995}, 'slk_Latn-srp_Latn': {'num_samples': 1997, 'number_of_characters': 509110, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 126.67, 'max_sentence1_length': 403, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 128.26, 'max_sentence2_length': 452, 'unique_sentence2': 1996}, 'slk_Latn-ukr_Cyrl': {'num_samples': 1997, 'number_of_characters': 515752, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 126.67, 'max_sentence1_length': 403, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 131.59, 'max_sentence2_length': 440, 'unique_sentence2': 1996}, 'slv_Latn-bel_Cyrl': {'num_samples': 1997, 'number_of_characters': 508986, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.64, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.24, 'max_sentence2_length': 422, 'unique_sentence2': 1996}, 'slv_Latn-bos_Latn': {'num_samples': 1997, 'number_of_characters': 507806, 'unique_pairs': 
1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.64, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 127.65, 'max_sentence2_length': 434, 'unique_sentence2': 1996}, 'slv_Latn-bul_Cyrl': {'num_samples': 1997, 'number_of_characters': 522785, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.64, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 135.15, 'max_sentence2_length': 493, 'unique_sentence2': 1996}, 'slv_Latn-ces_Latn': {'num_samples': 1997, 'number_of_characters': 494214, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 126.64, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 120.84, 'max_sentence2_length': 474, 'unique_sentence2': 1997}, 'slv_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 500616, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 126.64, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'slv_Latn-hrv_Latn': {'num_samples': 1997, 'number_of_characters': 508821, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 126.64, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.15, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'slv_Latn-mkd_Cyrl': {'num_samples': 1997, 'number_of_characters': 520787, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 126.64, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.15, 'max_sentence2_length': 451, 'unique_sentence2': 1997}, 'slv_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 530762, 
'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.64, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'slv_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 527789, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.64, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'slv_Latn-slk_Latn': {'num_samples': 1997, 'number_of_characters': 505865, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.64, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 126.67, 'max_sentence2_length': 403, 'unique_sentence2': 1996}, 'slv_Latn-srp_Cyrl': {'num_samples': 1997, 'number_of_characters': 505199, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.64, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.34, 'max_sentence2_length': 439, 'unique_sentence2': 1995}, 'slv_Latn-srp_Latn': {'num_samples': 1997, 'number_of_characters': 509037, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.64, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 128.26, 'max_sentence2_length': 452, 'unique_sentence2': 1996}, 'slv_Latn-ukr_Cyrl': {'num_samples': 1997, 'number_of_characters': 515679, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.64, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 131.59, 'max_sentence2_length': 440, 'unique_sentence2': 1996}, 'smo_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 
525575, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 139.14, 'max_sentence1_length': 431, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'smo_Latn-fij_Latn': {'num_samples': 1997, 'number_of_characters': 578360, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 139.14, 'max_sentence1_length': 431, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 150.48, 'max_sentence2_length': 448, 'unique_sentence2': 1988}, 'smo_Latn-fil_Latn': {'num_samples': 1997, 'number_of_characters': 571275, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 139.14, 'max_sentence1_length': 431, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 146.93, 'max_sentence2_length': 554, 'unique_sentence2': 1997}, 'smo_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 564827, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 139.14, 'max_sentence1_length': 431, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'smo_Latn-mal_Mlym': {'num_samples': 1997, 'number_of_characters': 582007, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 139.14, 'max_sentence1_length': 431, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 152.3, 'max_sentence2_length': 540, 'unique_sentence2': 1996}, 'smo_Latn-mlg_Latn': {'num_samples': 1997, 'number_of_characters': 598163, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 139.14, 'max_sentence1_length': 431, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 160.39, 'max_sentence2_length': 559, 'unique_sentence2': 1994}, 'smo_Latn-mri_Latn': {'num_samples': 1997, 
'number_of_characters': 551979, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 139.14, 'max_sentence1_length': 431, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 137.27, 'max_sentence2_length': 443, 'unique_sentence2': 1997}, 'smo_Latn-msa_Latn': {'num_samples': 1997, 'number_of_characters': 555038, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 139.14, 'max_sentence1_length': 431, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 138.8, 'max_sentence2_length': 463, 'unique_sentence2': 1997}, 'smo_Latn-tah_Latn': {'num_samples': 1997, 'number_of_characters': 587478, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 139.14, 'max_sentence1_length': 431, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 155.04, 'max_sentence2_length': 524, 'unique_sentence2': 1997}, 'smo_Latn-ton_Latn': {'num_samples': 1997, 'number_of_characters': 591495, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 139.14, 'max_sentence1_length': 431, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 157.06, 'max_sentence2_length': 468, 'unique_sentence2': 1997}, 'sna_Latn-bem_Latn': {'num_samples': 1997, 'number_of_characters': 596822, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 149.39, 'max_sentence1_length': 511, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 149.47, 'max_sentence2_length': 465, 'unique_sentence2': 1997}, 'sna_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 546050, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 149.39, 'max_sentence1_length': 511, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'sna_Latn-ewe_Latn': {'num_samples': 
1997, 'number_of_characters': 537308, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 149.39, 'max_sentence1_length': 511, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 119.67, 'max_sentence2_length': 493, 'unique_sentence2': 1994}, 'sna_Latn-fuc_Latn': {'num_samples': 1997, 'number_of_characters': 526810, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 149.39, 'max_sentence1_length': 511, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 114.41, 'max_sentence2_length': 376, 'unique_sentence2': 1996}, 'sna_Latn-kin_Latn': {'num_samples': 1997, 'number_of_characters': 602117, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 149.39, 'max_sentence1_length': 511, 'unique_sentence1': 1995, 'min_sentence2_length': 10, 'average_sentence2_length': 152.12, 'max_sentence2_length': 541, 'unique_sentence2': 1996}, 'sna_Latn-nde_Latn': {'num_samples': 1997, 'number_of_characters': 596069, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 149.39, 'max_sentence1_length': 511, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 149.09, 'max_sentence2_length': 590, 'unique_sentence2': 1997}, 'sna_Latn-nya_Latn': {'num_samples': 1997, 'number_of_characters': 582612, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 149.39, 'max_sentence1_length': 511, 'unique_sentence1': 1995, 'min_sentence2_length': 10, 'average_sentence2_length': 142.35, 'max_sentence2_length': 464, 'unique_sentence2': 1993}, 'sna_Latn-ven_Latn': {'num_samples': 1997, 'number_of_characters': 598086, 'unique_pairs': 1995, 'min_sentence1_length': 6, 'average_sentence1_length': 149.39, 'max_sentence1_length': 511, 'unique_sentence1': 1995, 'min_sentence2_length': 10, 'average_sentence2_length': 150.1, 'max_sentence2_length': 535, 'unique_sentence2': 1993}, 'snd_Arab-ben_Beng': 
{'num_samples': 1997, 'number_of_characters': 464129, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'snd_Arab-div_Thaa': {'num_samples': 1997, 'number_of_characters': 524175, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'snd_Arab-eng_Latn': {'num_samples': 1997, 'number_of_characters': 468047, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'snd_Arab-eus_Latn': {'num_samples': 1997, 'number_of_characters': 495530, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'snd_Arab-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 465258, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'snd_Arab-hin_Deva': {'num_samples': 1997, 'number_of_characters': 481902, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 
'snd_Arab-kan_Knda': {'num_samples': 1997, 'number_of_characters': 485863, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'snd_Arab-mar_Deva': {'num_samples': 1997, 'number_of_characters': 481214, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'snd_Arab-nep_Deva': {'num_samples': 1997, 'number_of_characters': 468550, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'snd_Arab-pan_Guru': {'num_samples': 1997, 'number_of_characters': 470749, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'snd_Arab-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 479068, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'snd_Arab-tam_Taml': {'num_samples': 1997, 'number_of_characters': 530288, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 
1997}, 'snd_Arab-tel_Telu': {'num_samples': 1997, 'number_of_characters': 467854, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'snd_Arab-urd_Arab': {'num_samples': 1997, 'number_of_characters': 468325, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'som_Latn-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 458799, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'som_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 539012, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'som_Latn-hau_Latn': {'num_samples': 1997, 'number_of_characters': 561258, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'som_Latn-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 537393, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 
'unique_sentence2': 1997}, 'som_Latn-nso_Latn': {'num_samples': 1997, 'number_of_characters': 582791, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 1996}, 'som_Latn-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 528723, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'som_Latn-ssw_Latn': {'num_samples': 1997, 'number_of_characters': 579434, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'som_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 563801, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'som_Latn-tir_Ethi': {'num_samples': 1997, 'number_of_characters': 456530, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 272, 'unique_sentence2': 1996}, 'som_Latn-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 625575, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 
556, 'unique_sentence2': 1997}, 'som_Latn-wol_Latn': {'num_samples': 1997, 'number_of_characters': 531095, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 'som_Latn-xho_Latn': {'num_samples': 1997, 'number_of_characters': 559382, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'som_Latn-yor_Latn': {'num_samples': 1997, 'number_of_characters': 607380, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'som_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 549024, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'spa_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 519381, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'spa_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 532002, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 
'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'spa_Latn-cat_Latn': {'num_samples': 1997, 'number_of_characters': 571160, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 141.69, 'max_sentence2_length': 460, 'unique_sentence2': 1997}, 'spa_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 583850, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'spa_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 587327, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'spa_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 535920, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'spa_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 531539, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'spa_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 558000, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 
135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'spa_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 581308, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'spa_Latn-glg_Latn': {'num_samples': 1997, 'number_of_characters': 560186, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 136.2, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'spa_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 488496, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'spa_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 549775, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'spa_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 566576, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'spa_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 575172, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 
143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'spa_Latn-ita_Latn': {'num_samples': 1997, 'number_of_characters': 577417, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 144.83, 'max_sentence2_length': 623, 'unique_sentence2': 1996}, 'spa_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 399826, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'spa_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 421605, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'spa_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 547597, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'spa_Latn-mlt_Latn': {'num_samples': 1997, 'number_of_characters': 565675, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 138.95, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'spa_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 580115, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 
146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'spa_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 566066, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'spa_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 565415, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'spa_Latn-ron_Latn': {'num_samples': 1997, 'number_of_characters': 580685, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 146.46, 'max_sentence2_length': 518, 'unique_sentence2': 1997}, 'spa_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 563093, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'spa_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 560709, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'spa_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 540027, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 
'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'spa_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 598161, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'spa_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 553813, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'spa_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 559321, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'spa_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 379678, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'spa_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 545932, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'sqi_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 582734, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 142.02, 'max_sentence1_length': 461, 'unique_sentence1': 1996, 'min_sentence2_length': 
7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'sqi_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 531327, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 142.02, 'max_sentence1_length': 461, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'sqi_Latn-hye_Armn': {'num_samples': 1997, 'number_of_characters': 548322, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 142.02, 'max_sentence1_length': 461, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 132.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'sqi_Latn-kat_Geor': {'num_samples': 1997, 'number_of_characters': 550199, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 142.02, 'max_sentence1_length': 461, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 133.5, 'max_sentence2_length': 503, 'unique_sentence2': 1995}, 'srp_Cyrl-bel_Cyrl': {'num_samples': 1997, 'number_of_characters': 508393, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.34, 'max_sentence1_length': 439, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 128.24, 'max_sentence2_length': 422, 'unique_sentence2': 1996}, 'srp_Cyrl-bos_Latn': {'num_samples': 1997, 'number_of_characters': 507213, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.34, 'max_sentence1_length': 439, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 127.65, 'max_sentence2_length': 434, 'unique_sentence2': 1996}, 'srp_Cyrl-bul_Cyrl': {'num_samples': 1997, 'number_of_characters': 522192, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.34, 'max_sentence1_length': 439, 'unique_sentence1': 1995, 
'min_sentence2_length': 6, 'average_sentence2_length': 135.15, 'max_sentence2_length': 493, 'unique_sentence2': 1996}, 'srp_Cyrl-ces_Latn': {'num_samples': 1997, 'number_of_characters': 493621, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 126.34, 'max_sentence1_length': 439, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 120.84, 'max_sentence2_length': 474, 'unique_sentence2': 1997}, 'srp_Cyrl-eng_Latn': {'num_samples': 1997, 'number_of_characters': 500023, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 126.34, 'max_sentence1_length': 439, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'srp_Cyrl-hrv_Latn': {'num_samples': 1997, 'number_of_characters': 508228, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 126.34, 'max_sentence1_length': 439, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 128.15, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'srp_Cyrl-mkd_Cyrl': {'num_samples': 1997, 'number_of_characters': 520194, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 126.34, 'max_sentence1_length': 439, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 134.15, 'max_sentence2_length': 451, 'unique_sentence2': 1997}, 'srp_Cyrl-pol_Latn': {'num_samples': 1997, 'number_of_characters': 530169, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.34, 'max_sentence1_length': 439, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'srp_Cyrl-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 527196, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.34, 'max_sentence1_length': 439, 'unique_sentence1': 
1995, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'srp_Cyrl-slk_Latn': {'num_samples': 1997, 'number_of_characters': 505272, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.34, 'max_sentence1_length': 439, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 126.67, 'max_sentence2_length': 403, 'unique_sentence2': 1996}, 'srp_Cyrl-slv_Latn': {'num_samples': 1997, 'number_of_characters': 505199, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.34, 'max_sentence1_length': 439, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 126.64, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'srp_Cyrl-srp_Latn': {'num_samples': 1997, 'number_of_characters': 508444, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.34, 'max_sentence1_length': 439, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 128.26, 'max_sentence2_length': 452, 'unique_sentence2': 1996}, 'srp_Cyrl-ukr_Cyrl': {'num_samples': 1997, 'number_of_characters': 515086, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.34, 'max_sentence1_length': 439, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 131.59, 'max_sentence2_length': 440, 'unique_sentence2': 1996}, 'srp_Latn-bel_Cyrl': {'num_samples': 1997, 'number_of_characters': 512231, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 128.26, 'max_sentence1_length': 452, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.24, 'max_sentence2_length': 422, 'unique_sentence2': 1996}, 'srp_Latn-bos_Latn': {'num_samples': 1997, 'number_of_characters': 511051, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 128.26, 'max_sentence1_length': 452, 
'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 127.65, 'max_sentence2_length': 434, 'unique_sentence2': 1996}, 'srp_Latn-bul_Cyrl': {'num_samples': 1997, 'number_of_characters': 526030, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 128.26, 'max_sentence1_length': 452, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 135.15, 'max_sentence2_length': 493, 'unique_sentence2': 1996}, 'srp_Latn-ces_Latn': {'num_samples': 1997, 'number_of_characters': 497459, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 128.26, 'max_sentence1_length': 452, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 120.84, 'max_sentence2_length': 474, 'unique_sentence2': 1997}, 'srp_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 503861, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 128.26, 'max_sentence1_length': 452, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'srp_Latn-hrv_Latn': {'num_samples': 1997, 'number_of_characters': 512066, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 128.26, 'max_sentence1_length': 452, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.15, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'srp_Latn-mkd_Cyrl': {'num_samples': 1997, 'number_of_characters': 524032, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 128.26, 'max_sentence1_length': 452, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.15, 'max_sentence2_length': 451, 'unique_sentence2': 1997}, 'srp_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 534007, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 128.26, 'max_sentence1_length': 
452, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'srp_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 531034, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 128.26, 'max_sentence1_length': 452, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'srp_Latn-slk_Latn': {'num_samples': 1997, 'number_of_characters': 509110, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 128.26, 'max_sentence1_length': 452, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 126.67, 'max_sentence2_length': 403, 'unique_sentence2': 1996}, 'srp_Latn-slv_Latn': {'num_samples': 1997, 'number_of_characters': 509037, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 128.26, 'max_sentence1_length': 452, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.64, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'srp_Latn-srp_Cyrl': {'num_samples': 1997, 'number_of_characters': 508444, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 128.26, 'max_sentence1_length': 452, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.34, 'max_sentence2_length': 439, 'unique_sentence2': 1995}, 'srp_Latn-ukr_Cyrl': {'num_samples': 1997, 'number_of_characters': 518924, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 128.26, 'max_sentence1_length': 452, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 131.59, 'max_sentence2_length': 440, 'unique_sentence2': 1996}, 'ssw_Latn-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 455649, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 144.29, 
'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'ssw_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 535862, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 144.29, 'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ssw_Latn-hau_Latn': {'num_samples': 1997, 'number_of_characters': 558108, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 144.29, 'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'ssw_Latn-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 534243, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 144.29, 'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 'unique_sentence2': 1997}, 'ssw_Latn-nso_Latn': {'num_samples': 1997, 'number_of_characters': 579641, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 144.29, 'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 1996}, 'ssw_Latn-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 525573, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 144.29, 'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'ssw_Latn-som_Latn': {'num_samples': 1997, 'number_of_characters': 579434, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 
144.29, 'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'ssw_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 560651, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 144.29, 'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'ssw_Latn-tir_Ethi': {'num_samples': 1997, 'number_of_characters': 453380, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 144.29, 'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 272, 'unique_sentence2': 1996}, 'ssw_Latn-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 622425, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 144.29, 'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 'unique_sentence2': 1997}, 'ssw_Latn-wol_Latn': {'num_samples': 1997, 'number_of_characters': 527945, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 144.29, 'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 'ssw_Latn-xho_Latn': {'num_samples': 1997, 'number_of_characters': 556232, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 144.29, 'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'ssw_Latn-yor_Latn': {'num_samples': 1997, 'number_of_characters': 604230, 'unique_pairs': 1997, 'min_sentence1_length': 8, 
'average_sentence1_length': 144.29, 'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'ssw_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 545874, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 144.29, 'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'swa_Latn-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 440016, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'swa_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 503690, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'swa_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 516311, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'swa_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 568159, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'swa_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 571636, 'unique_pairs': 1997, 
'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'swa_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 520229, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'swa_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 515848, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'swa_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 542309, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'swa_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 565617, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'swa_Latn-hau_Latn': {'num_samples': 1997, 'number_of_characters': 542475, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'swa_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 472805, 
'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'swa_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 534084, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'swa_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 550885, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'swa_Latn-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 518610, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 'unique_sentence2': 1997}, 'swa_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 559481, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'swa_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 384135, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'swa_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 
405914, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'swa_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 531906, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'swa_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 564424, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'swa_Latn-nso_Latn': {'num_samples': 1997, 'number_of_characters': 564008, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 1996}, 'swa_Latn-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 509940, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'swa_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 550375, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'swa_Latn-por_Latn': {'num_samples': 1997, 
'number_of_characters': 549724, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'swa_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 547402, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'swa_Latn-som_Latn': {'num_samples': 1997, 'number_of_characters': 563801, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'swa_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 560709, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'swa_Latn-ssw_Latn': {'num_samples': 1997, 'number_of_characters': 560651, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'swa_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 524336, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'swa_Latn-tam_Taml': 
{'num_samples': 1997, 'number_of_characters': 582470, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'swa_Latn-tir_Ethi': {'num_samples': 1997, 'number_of_characters': 437747, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 272, 'unique_sentence2': 1996}, 'swa_Latn-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 606792, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 'unique_sentence2': 1997}, 'swa_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 538122, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'swa_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 543630, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'swa_Latn-wol_Latn': {'num_samples': 1997, 'number_of_characters': 512312, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 
'swa_Latn-xho_Latn': {'num_samples': 1997, 'number_of_characters': 540599, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'swa_Latn-yor_Latn': {'num_samples': 1997, 'number_of_characters': 588597, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'swa_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 363987, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'swa_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 530241, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'swe_Latn-afr_Latn': {'num_samples': 1997, 'number_of_characters': 520179, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 134.38, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'swe_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 483008, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 
1995}, 'swe_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 495629, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'swe_Latn-dan_Latn': {'num_samples': 1997, 'number_of_characters': 503965, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 126.26, 'max_sentence2_length': 522, 'unique_sentence2': 1995}, 'swe_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 547477, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'swe_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 550954, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'swe_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 499547, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'swe_Latn-fao_Latn': {'num_samples': 1997, 'number_of_characters': 509630, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.1, 'max_sentence2_length': 433, 'unique_sentence2': 
1997}, 'swe_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 495166, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'swe_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 521627, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'swe_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 544935, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'swe_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 452123, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'swe_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 513402, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'swe_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 530203, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 
1997}, 'swe_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 538799, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'swe_Latn-isl_Latn': {'num_samples': 1997, 'number_of_characters': 514035, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 131.3, 'max_sentence2_length': 399, 'unique_sentence2': 1996}, 'swe_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 363453, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'swe_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 385232, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'swe_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 511224, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'swe_Latn-ltz_Latn': {'num_samples': 1997, 'number_of_characters': 532584, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 140.59, 'max_sentence2_length': 543, 'unique_sentence2': 1996}, 
'swe_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 543742, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'swe_Latn-nno_Latn': {'num_samples': 1997, 'number_of_characters': 500184, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.37, 'max_sentence2_length': 417, 'unique_sentence2': 1996}, 'swe_Latn-nob_Latn': {'num_samples': 1997, 'number_of_characters': 503271, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 125.91, 'max_sentence2_length': 482, 'unique_sentence2': 1996}, 'swe_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 529693, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'swe_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 529042, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'swe_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 526720, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 
'swe_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 540027, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'swe_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 524336, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'swe_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 561788, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'swe_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 517440, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'swe_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 522948, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'swe_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 343305, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 
'swe_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 509559, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'tah_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 557343, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 155.04, 'max_sentence1_length': 524, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'tah_Latn-fij_Latn': {'num_samples': 1997, 'number_of_characters': 610128, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 155.04, 'max_sentence1_length': 524, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 150.48, 'max_sentence2_length': 448, 'unique_sentence2': 1988}, 'tah_Latn-fil_Latn': {'num_samples': 1997, 'number_of_characters': 603043, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 155.04, 'max_sentence1_length': 524, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 146.93, 'max_sentence2_length': 554, 'unique_sentence2': 1997}, 'tah_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 596595, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 155.04, 'max_sentence1_length': 524, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'tah_Latn-mal_Mlym': {'num_samples': 1997, 'number_of_characters': 613775, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 155.04, 'max_sentence1_length': 524, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 152.3, 'max_sentence2_length': 540, 'unique_sentence2': 
1996}, 'tah_Latn-mlg_Latn': {'num_samples': 1997, 'number_of_characters': 629931, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 155.04, 'max_sentence1_length': 524, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 160.39, 'max_sentence2_length': 559, 'unique_sentence2': 1994}, 'tah_Latn-mri_Latn': {'num_samples': 1997, 'number_of_characters': 583747, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 155.04, 'max_sentence1_length': 524, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 137.27, 'max_sentence2_length': 443, 'unique_sentence2': 1997}, 'tah_Latn-msa_Latn': {'num_samples': 1997, 'number_of_characters': 586806, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 155.04, 'max_sentence1_length': 524, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 138.8, 'max_sentence2_length': 463, 'unique_sentence2': 1997}, 'tah_Latn-smo_Latn': {'num_samples': 1997, 'number_of_characters': 587478, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 155.04, 'max_sentence1_length': 524, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 139.14, 'max_sentence2_length': 431, 'unique_sentence2': 1996}, 'tah_Latn-ton_Latn': {'num_samples': 1997, 'number_of_characters': 623263, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 155.04, 'max_sentence1_length': 524, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 157.06, 'max_sentence2_length': 468, 'unique_sentence2': 1997}, 'tam_Taml-arb_Arab': {'num_samples': 1997, 'number_of_characters': 541142, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 
'unique_sentence2': 1995}, 'tam_Taml-ben_Beng': {'num_samples': 1997, 'number_of_characters': 553763, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'tam_Taml-deu_Latn': {'num_samples': 1997, 'number_of_characters': 605611, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'tam_Taml-div_Thaa': {'num_samples': 1997, 'number_of_characters': 613809, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'tam_Taml-ell_Grek': {'num_samples': 1997, 'number_of_characters': 609088, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'tam_Taml-eng_Latn': {'num_samples': 1997, 'number_of_characters': 557681, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'tam_Taml-eus_Latn': {'num_samples': 1997, 'number_of_characters': 585164, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 
'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'tam_Taml-fas_Arab': {'num_samples': 1997, 'number_of_characters': 553300, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'tam_Taml-fin_Latn': {'num_samples': 1997, 'number_of_characters': 579761, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'tam_Taml-fra_Latn': {'num_samples': 1997, 'number_of_characters': 603069, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'tam_Taml-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 554892, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'tam_Taml-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 510257, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'tam_Taml-hin_Deva': {'num_samples': 1997, 'number_of_characters': 571536, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 
130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'tam_Taml-hun_Latn': {'num_samples': 1997, 'number_of_characters': 588337, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'tam_Taml-ind_Latn': {'num_samples': 1997, 'number_of_characters': 596933, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'tam_Taml-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 421587, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'tam_Taml-kan_Knda': {'num_samples': 1997, 'number_of_characters': 575497, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'tam_Taml-kor_Hang': {'num_samples': 1997, 'number_of_characters': 443366, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'tam_Taml-lit_Latn': {'num_samples': 1997, 'number_of_characters': 569358, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 
'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'tam_Taml-mar_Deva': {'num_samples': 1997, 'number_of_characters': 570848, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'tam_Taml-nep_Deva': {'num_samples': 1997, 'number_of_characters': 558184, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'tam_Taml-nld_Latn': {'num_samples': 1997, 'number_of_characters': 601876, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'tam_Taml-pan_Guru': {'num_samples': 1997, 'number_of_characters': 560383, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'tam_Taml-pol_Latn': {'num_samples': 1997, 'number_of_characters': 587827, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'tam_Taml-por_Latn': {'num_samples': 1997, 'number_of_characters': 587176, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 
'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'tam_Taml-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 584854, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'tam_Taml-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 568702, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'tam_Taml-snd_Arab': {'num_samples': 1997, 'number_of_characters': 530288, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'tam_Taml-spa_Latn': {'num_samples': 1997, 'number_of_characters': 598161, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'tam_Taml-swa_Latn': {'num_samples': 1997, 'number_of_characters': 582470, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'tam_Taml-swe_Latn': {'num_samples': 1997, 'number_of_characters': 561788, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 
'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'tam_Taml-tel_Telu': {'num_samples': 1997, 'number_of_characters': 557488, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'tam_Taml-tur_Latn': {'num_samples': 1997, 'number_of_characters': 575574, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'tam_Taml-urd_Arab': {'num_samples': 1997, 'number_of_characters': 557959, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'tam_Taml-vie_Latn': {'num_samples': 1997, 'number_of_characters': 581082, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'tam_Taml-zho_Hant': {'num_samples': 1997, 'number_of_characters': 401439, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'tam_Taml-zul_Latn': {'num_samples': 1997, 'number_of_characters': 567693, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 
'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'tat_Cyrl-aze_Latn': {'num_samples': 1997, 'number_of_characters': 515560, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 123.15, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 135.02, 'max_sentence2_length': 398, 'unique_sentence2': 1997}, 'tat_Cyrl-bak_Cyrl': {'num_samples': 1997, 'number_of_characters': 492252, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 123.15, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 123.35, 'max_sentence2_length': 437, 'unique_sentence2': 1995}, 'tat_Cyrl-eng_Latn': {'num_samples': 1997, 'number_of_characters': 493646, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 123.15, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'tat_Cyrl-kaz_Cyrl': {'num_samples': 1997, 'number_of_characters': 506202, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 123.15, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 130.33, 'max_sentence2_length': 473, 'unique_sentence2': 1996}, 'tat_Cyrl-kir_Cyrl': {'num_samples': 1997, 'number_of_characters': 496790, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 123.15, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 125.62, 'max_sentence2_length': 395, 'unique_sentence2': 1996}, 'tat_Cyrl-tuk_Latn': {'num_samples': 1997, 'number_of_characters': 531200, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 
123.15, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 142.85, 'max_sentence2_length': 576, 'unique_sentence2': 1996}, 'tat_Cyrl-tur_Latn': {'num_samples': 1997, 'number_of_characters': 511539, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 123.15, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'tat_Cyrl-uig_Arab': {'num_samples': 1997, 'number_of_characters': 556948, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 123.15, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 155.74, 'max_sentence2_length': 592, 'unique_sentence2': 1996}, 'tat_Cyrl-uzb_Latn': {'num_samples': 1997, 'number_of_characters': 539621, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 123.15, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 147.07, 'max_sentence2_length': 470, 'unique_sentence2': 1996}, 'tel_Telu-ben_Beng': {'num_samples': 1997, 'number_of_characters': 491329, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'tel_Telu-div_Thaa': {'num_samples': 1997, 'number_of_characters': 551375, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'tel_Telu-eng_Latn': {'num_samples': 1997, 'number_of_characters': 495247, 'unique_pairs': 1997, 'min_sentence1_length': 10, 
'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'tel_Telu-eus_Latn': {'num_samples': 1997, 'number_of_characters': 522730, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'tel_Telu-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 492458, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'tel_Telu-hin_Deva': {'num_samples': 1997, 'number_of_characters': 509102, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'tel_Telu-kan_Knda': {'num_samples': 1997, 'number_of_characters': 513063, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'tel_Telu-mar_Deva': {'num_samples': 1997, 'number_of_characters': 508414, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'tel_Telu-nep_Deva': {'num_samples': 1997, 'number_of_characters': 495750, 'unique_pairs': 1996, 
'min_sentence1_length': 10, 'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'tel_Telu-pan_Guru': {'num_samples': 1997, 'number_of_characters': 497949, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'tel_Telu-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 506268, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'tel_Telu-snd_Arab': {'num_samples': 1997, 'number_of_characters': 467854, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'tel_Telu-tam_Taml': {'num_samples': 1997, 'number_of_characters': 557488, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'tel_Telu-urd_Arab': {'num_samples': 1997, 'number_of_characters': 495525, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'tgk_Cyrl-arb_Arab': {'num_samples': 1997, 'number_of_characters': 505328, 
'unique_pairs': 1995, 'min_sentence1_length': 3, 'average_sentence1_length': 137.28, 'max_sentence1_length': 451, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'tgk_Cyrl-ckb_Arab': {'num_samples': 1997, 'number_of_characters': 526514, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 137.28, 'max_sentence1_length': 451, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 126.37, 'max_sentence2_length': 399, 'unique_sentence2': 1995}, 'tgk_Cyrl-eng_Latn': {'num_samples': 1997, 'number_of_characters': 521867, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 137.28, 'max_sentence1_length': 451, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'tgk_Cyrl-fas_Arab': {'num_samples': 1997, 'number_of_characters': 517486, 'unique_pairs': 1995, 'min_sentence1_length': 3, 'average_sentence1_length': 137.28, 'max_sentence1_length': 451, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'tgk_Cyrl-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 474443, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 137.28, 'max_sentence1_length': 451, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'tgk_Cyrl-kmr_Latn': {'num_samples': 1997, 'number_of_characters': 520093, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 137.28, 'max_sentence1_length': 451, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 123.16, 'max_sentence2_length': 420, 'unique_sentence2': 1996}, 'tgk_Cyrl-mey_Arab': {'num_samples': 1997, 'number_of_characters': 
487982, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 137.28, 'max_sentence1_length': 451, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 107.08, 'max_sentence2_length': 392, 'unique_sentence2': 1993}, 'tgk_Cyrl-prs_Arab': {'num_samples': 1997, 'number_of_characters': 516683, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 137.28, 'max_sentence1_length': 451, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 121.45, 'max_sentence2_length': 365, 'unique_sentence2': 1997}, 'tgk_Cyrl-pus_Arab': {'num_samples': 1997, 'number_of_characters': 516780, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 137.28, 'max_sentence1_length': 451, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 121.5, 'max_sentence2_length': 366, 'unique_sentence2': 1996}, 'tgk_Cyrl-shi_Arab': {'num_samples': 1997, 'number_of_characters': 489060, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 137.28, 'max_sentence1_length': 451, 'unique_sentence1': 1995, 'min_sentence2_length': 3, 'average_sentence2_length': 107.62, 'max_sentence2_length': 378, 'unique_sentence2': 1996}, 'tha_Thai-bod_Tibt': {'num_samples': 1997, 'number_of_characters': 538097, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 118.91, 'max_sentence1_length': 439, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 150.54, 'max_sentence2_length': 478, 'unique_sentence2': 1993}, 'tha_Thai-dzo_Tibt': {'num_samples': 1997, 'number_of_characters': 480689, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 118.91, 'max_sentence1_length': 439, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 121.79, 'max_sentence2_length': 411, 'unique_sentence2': 1992}, 'tha_Thai-eng_Latn': {'num_samples': 1997, 
'number_of_characters': 485188, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 118.91, 'max_sentence1_length': 439, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'tha_Thai-khm_Khmr': {'num_samples': 1997, 'number_of_characters': 525959, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 118.91, 'max_sentence1_length': 439, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 144.46, 'max_sentence2_length': 517, 'unique_sentence2': 1996}, 'tha_Thai-lao_Laoo': {'num_samples': 1997, 'number_of_characters': 504448, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 118.91, 'max_sentence1_length': 439, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 133.69, 'max_sentence2_length': 507, 'unique_sentence2': 1997}, 'tha_Thai-mon_Mong': {'num_samples': 1997, 'number_of_characters': 496516, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 118.91, 'max_sentence1_length': 439, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 129.72, 'max_sentence2_length': 414, 'unique_sentence2': 1997}, 'tha_Thai-mya_Mymr': {'num_samples': 1997, 'number_of_characters': 549322, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 118.91, 'max_sentence1_length': 439, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 156.16, 'max_sentence2_length': 773, 'unique_sentence2': 1997}, 'tir_Ethi-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 332745, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'tir_Ethi-eng_Latn': {'num_samples': 
1997, 'number_of_characters': 412958, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'tir_Ethi-hau_Latn': {'num_samples': 1997, 'number_of_characters': 435204, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'tir_Ethi-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 411339, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 'unique_sentence2': 1997}, 'tir_Ethi-nso_Latn': {'num_samples': 1997, 'number_of_characters': 456737, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 1996}, 'tir_Ethi-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 402669, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'tir_Ethi-som_Latn': {'num_samples': 1997, 'number_of_characters': 456530, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'tir_Ethi-ssw_Latn': {'num_samples': 
1997, 'number_of_characters': 453380, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'tir_Ethi-swa_Latn': {'num_samples': 1997, 'number_of_characters': 437747, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'tir_Ethi-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 499521, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 'unique_sentence2': 1997}, 'tir_Ethi-wol_Latn': {'num_samples': 1997, 'number_of_characters': 405041, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 'tir_Ethi-xho_Latn': {'num_samples': 1997, 'number_of_characters': 433328, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'tir_Ethi-yor_Latn': {'num_samples': 1997, 'number_of_characters': 481326, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'tir_Ethi-zul_Latn': {'num_samples': 
1997, 'number_of_characters': 422970, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'ton_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 561360, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 157.06, 'max_sentence1_length': 468, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ton_Latn-fij_Latn': {'num_samples': 1997, 'number_of_characters': 614145, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 157.06, 'max_sentence1_length': 468, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 150.48, 'max_sentence2_length': 448, 'unique_sentence2': 1988}, 'ton_Latn-fil_Latn': {'num_samples': 1997, 'number_of_characters': 607060, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 157.06, 'max_sentence1_length': 468, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 146.93, 'max_sentence2_length': 554, 'unique_sentence2': 1997}, 'ton_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 600612, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 157.06, 'max_sentence1_length': 468, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'ton_Latn-mal_Mlym': {'num_samples': 1997, 'number_of_characters': 617792, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 157.06, 'max_sentence1_length': 468, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 152.3, 'max_sentence2_length': 540, 'unique_sentence2': 1996}, 'ton_Latn-mlg_Latn': 
{'num_samples': 1997, 'number_of_characters': 633948, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 157.06, 'max_sentence1_length': 468, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 160.39, 'max_sentence2_length': 559, 'unique_sentence2': 1994}, 'ton_Latn-mri_Latn': {'num_samples': 1997, 'number_of_characters': 587764, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 157.06, 'max_sentence1_length': 468, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 137.27, 'max_sentence2_length': 443, 'unique_sentence2': 1997}, 'ton_Latn-msa_Latn': {'num_samples': 1997, 'number_of_characters': 590823, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 157.06, 'max_sentence1_length': 468, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 138.8, 'max_sentence2_length': 463, 'unique_sentence2': 1997}, 'ton_Latn-smo_Latn': {'num_samples': 1997, 'number_of_characters': 591495, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 157.06, 'max_sentence1_length': 468, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 139.14, 'max_sentence2_length': 431, 'unique_sentence2': 1996}, 'ton_Latn-tah_Latn': {'num_samples': 1997, 'number_of_characters': 623263, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 157.06, 'max_sentence1_length': 468, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 155.04, 'max_sentence2_length': 524, 'unique_sentence2': 1997}, 'tsn_Latn-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 501790, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 
'tsn_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 582003, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'tsn_Latn-hau_Latn': {'num_samples': 1997, 'number_of_characters': 604249, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'tsn_Latn-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 580384, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 'unique_sentence2': 1997}, 'tsn_Latn-nso_Latn': {'num_samples': 1997, 'number_of_characters': 625782, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 1996}, 'tsn_Latn-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 571714, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'tsn_Latn-som_Latn': {'num_samples': 1997, 'number_of_characters': 625575, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 
1997}, 'tsn_Latn-ssw_Latn': {'num_samples': 1997, 'number_of_characters': 622425, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'tsn_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 606792, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'tsn_Latn-tir_Ethi': {'num_samples': 1997, 'number_of_characters': 499521, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 272, 'unique_sentence2': 1996}, 'tsn_Latn-wol_Latn': {'num_samples': 1997, 'number_of_characters': 574086, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 'tsn_Latn-xho_Latn': {'num_samples': 1997, 'number_of_characters': 602373, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'tsn_Latn-yor_Latn': {'num_samples': 1997, 'number_of_characters': 650371, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 
'unique_sentence2': 1996}, 'tsn_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 592015, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'tuk_Latn-aze_Latn': {'num_samples': 1997, 'number_of_characters': 554908, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 142.85, 'max_sentence1_length': 576, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 135.02, 'max_sentence2_length': 398, 'unique_sentence2': 1997}, 'tuk_Latn-bak_Cyrl': {'num_samples': 1997, 'number_of_characters': 531600, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 142.85, 'max_sentence1_length': 576, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 123.35, 'max_sentence2_length': 437, 'unique_sentence2': 1995}, 'tuk_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 532994, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 142.85, 'max_sentence1_length': 576, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'tuk_Latn-kaz_Cyrl': {'num_samples': 1997, 'number_of_characters': 545550, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 142.85, 'max_sentence1_length': 576, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 130.33, 'max_sentence2_length': 473, 'unique_sentence2': 1996}, 'tuk_Latn-kir_Cyrl': {'num_samples': 1997, 'number_of_characters': 536138, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 142.85, 'max_sentence1_length': 576, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 125.62, 'max_sentence2_length': 
395, 'unique_sentence2': 1996}, 'tuk_Latn-tat_Cyrl': {'num_samples': 1997, 'number_of_characters': 531200, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 142.85, 'max_sentence1_length': 576, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 123.15, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'tuk_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 550887, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 142.85, 'max_sentence1_length': 576, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'tuk_Latn-uig_Arab': {'num_samples': 1997, 'number_of_characters': 596296, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 142.85, 'max_sentence1_length': 576, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 155.74, 'max_sentence2_length': 592, 'unique_sentence2': 1996}, 'tuk_Latn-uzb_Latn': {'num_samples': 1997, 'number_of_characters': 578969, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 142.85, 'max_sentence1_length': 576, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 147.07, 'max_sentence2_length': 470, 'unique_sentence2': 1996}, 'tur_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 496794, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'tur_Latn-aze_Latn': {'num_samples': 1997, 'number_of_characters': 535247, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 135.02, 
'max_sentence2_length': 398, 'unique_sentence2': 1997}, 'tur_Latn-bak_Cyrl': {'num_samples': 1997, 'number_of_characters': 511939, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 123.35, 'max_sentence2_length': 437, 'unique_sentence2': 1995}, 'tur_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 509415, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'tur_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 561263, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'tur_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 564740, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'tur_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 513333, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'tur_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 508952, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 
121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'tur_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 535413, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'tur_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 558721, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'tur_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 465909, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'tur_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 527188, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'tur_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 543989, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'tur_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 552585, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 
'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'tur_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 377239, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'tur_Latn-kaz_Cyrl': {'num_samples': 1997, 'number_of_characters': 525889, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 130.33, 'max_sentence2_length': 473, 'unique_sentence2': 1996}, 'tur_Latn-kir_Cyrl': {'num_samples': 1997, 'number_of_characters': 516477, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 125.62, 'max_sentence2_length': 395, 'unique_sentence2': 1996}, 'tur_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 399018, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'tur_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 525010, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'tur_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 557528, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 
'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'tur_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 543479, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'tur_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 542828, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'tur_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 540506, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'tur_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 553813, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'tur_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 538122, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'tur_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 517440, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 
'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'tur_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 575574, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'tur_Latn-tat_Cyrl': {'num_samples': 1997, 'number_of_characters': 511539, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 123.15, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'tur_Latn-tuk_Latn': {'num_samples': 1997, 'number_of_characters': 550887, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 142.85, 'max_sentence2_length': 576, 'unique_sentence2': 1996}, 'tur_Latn-uig_Arab': {'num_samples': 1997, 'number_of_characters': 576635, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 155.74, 'max_sentence2_length': 592, 'unique_sentence2': 1996}, 'tur_Latn-uzb_Latn': {'num_samples': 1997, 'number_of_characters': 559308, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 147.07, 'max_sentence2_length': 470, 'unique_sentence2': 1996}, 'tur_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 536734, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 
1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'tur_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 357091, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'tur_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 523345, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'uig_Arab-aze_Latn': {'num_samples': 1997, 'number_of_characters': 580656, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 155.74, 'max_sentence1_length': 592, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 135.02, 'max_sentence2_length': 398, 'unique_sentence2': 1997}, 'uig_Arab-bak_Cyrl': {'num_samples': 1997, 'number_of_characters': 557348, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 155.74, 'max_sentence1_length': 592, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 123.35, 'max_sentence2_length': 437, 'unique_sentence2': 1995}, 'uig_Arab-eng_Latn': {'num_samples': 1997, 'number_of_characters': 558742, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 155.74, 'max_sentence1_length': 592, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'uig_Arab-kaz_Cyrl': {'num_samples': 1997, 'number_of_characters': 571298, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 155.74, 'max_sentence1_length': 592, 
'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 130.33, 'max_sentence2_length': 473, 'unique_sentence2': 1996}, 'uig_Arab-kir_Cyrl': {'num_samples': 1997, 'number_of_characters': 561886, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 155.74, 'max_sentence1_length': 592, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 125.62, 'max_sentence2_length': 395, 'unique_sentence2': 1996}, 'uig_Arab-tat_Cyrl': {'num_samples': 1997, 'number_of_characters': 556948, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 155.74, 'max_sentence1_length': 592, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 123.15, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'uig_Arab-tuk_Latn': {'num_samples': 1997, 'number_of_characters': 596296, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 155.74, 'max_sentence1_length': 592, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 142.85, 'max_sentence2_length': 576, 'unique_sentence2': 1996}, 'uig_Arab-tur_Latn': {'num_samples': 1997, 'number_of_characters': 576635, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 155.74, 'max_sentence1_length': 592, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'uig_Arab-uzb_Latn': {'num_samples': 1997, 'number_of_characters': 604717, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 155.74, 'max_sentence1_length': 592, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 147.07, 'max_sentence2_length': 470, 'unique_sentence2': 1996}, 'ukr_Cyrl-bel_Cyrl': {'num_samples': 1997, 'number_of_characters': 518873, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 131.59, 'max_sentence1_length': 
440, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.24, 'max_sentence2_length': 422, 'unique_sentence2': 1996}, 'ukr_Cyrl-bos_Latn': {'num_samples': 1997, 'number_of_characters': 517693, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 131.59, 'max_sentence1_length': 440, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 127.65, 'max_sentence2_length': 434, 'unique_sentence2': 1996}, 'ukr_Cyrl-bul_Cyrl': {'num_samples': 1997, 'number_of_characters': 532672, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 131.59, 'max_sentence1_length': 440, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 135.15, 'max_sentence2_length': 493, 'unique_sentence2': 1996}, 'ukr_Cyrl-ces_Latn': {'num_samples': 1997, 'number_of_characters': 504101, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 131.59, 'max_sentence1_length': 440, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 120.84, 'max_sentence2_length': 474, 'unique_sentence2': 1997}, 'ukr_Cyrl-eng_Latn': {'num_samples': 1997, 'number_of_characters': 510503, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 131.59, 'max_sentence1_length': 440, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ukr_Cyrl-hrv_Latn': {'num_samples': 1997, 'number_of_characters': 518708, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 131.59, 'max_sentence1_length': 440, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.15, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'ukr_Cyrl-mkd_Cyrl': {'num_samples': 1997, 'number_of_characters': 530674, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 131.59, 
'max_sentence1_length': 440, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.15, 'max_sentence2_length': 451, 'unique_sentence2': 1997}, 'ukr_Cyrl-pol_Latn': {'num_samples': 1997, 'number_of_characters': 540649, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 131.59, 'max_sentence1_length': 440, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'ukr_Cyrl-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 537676, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 131.59, 'max_sentence1_length': 440, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'ukr_Cyrl-slk_Latn': {'num_samples': 1997, 'number_of_characters': 515752, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 131.59, 'max_sentence1_length': 440, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 126.67, 'max_sentence2_length': 403, 'unique_sentence2': 1996}, 'ukr_Cyrl-slv_Latn': {'num_samples': 1997, 'number_of_characters': 515679, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 131.59, 'max_sentence1_length': 440, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.64, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'ukr_Cyrl-srp_Cyrl': {'num_samples': 1997, 'number_of_characters': 515086, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 131.59, 'max_sentence1_length': 440, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.34, 'max_sentence2_length': 439, 'unique_sentence2': 1995}, 'ukr_Cyrl-srp_Latn': {'num_samples': 1997, 'number_of_characters': 518924, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 
131.59, 'max_sentence1_length': 440, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 128.26, 'max_sentence2_length': 452, 'unique_sentence2': 1996}, 'urd_Arab-ben_Beng': {'num_samples': 1997, 'number_of_characters': 491800, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'urd_Arab-div_Thaa': {'num_samples': 1997, 'number_of_characters': 551846, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'urd_Arab-eng_Latn': {'num_samples': 1997, 'number_of_characters': 495718, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'urd_Arab-eus_Latn': {'num_samples': 1997, 'number_of_characters': 523201, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'urd_Arab-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 492929, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'urd_Arab-hin_Deva': {'num_samples': 1997, 'number_of_characters': 509573, 'unique_pairs': 1996, 'min_sentence1_length': 7, 
'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'urd_Arab-kan_Knda': {'num_samples': 1997, 'number_of_characters': 513534, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'urd_Arab-mar_Deva': {'num_samples': 1997, 'number_of_characters': 508885, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'urd_Arab-nep_Deva': {'num_samples': 1997, 'number_of_characters': 496221, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'urd_Arab-pan_Guru': {'num_samples': 1997, 'number_of_characters': 498420, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'urd_Arab-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 506739, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'urd_Arab-snd_Arab': {'num_samples': 1997, 'number_of_characters': 468325, 'unique_pairs': 1996, 'min_sentence1_length': 
7, 'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'urd_Arab-tam_Taml': {'num_samples': 1997, 'number_of_characters': 557959, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'urd_Arab-tel_Telu': {'num_samples': 1997, 'number_of_characters': 495525, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'uzb_Latn-aze_Latn': {'num_samples': 1997, 'number_of_characters': 563329, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 147.07, 'max_sentence1_length': 470, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 135.02, 'max_sentence2_length': 398, 'unique_sentence2': 1997}, 'uzb_Latn-bak_Cyrl': {'num_samples': 1997, 'number_of_characters': 540021, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 147.07, 'max_sentence1_length': 470, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 123.35, 'max_sentence2_length': 437, 'unique_sentence2': 1995}, 'uzb_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 541415, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 147.07, 'max_sentence1_length': 470, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'uzb_Latn-kaz_Cyrl': {'num_samples': 1997, 'number_of_characters': 553971, 'unique_pairs': 1996, 
'min_sentence1_length': 8, 'average_sentence1_length': 147.07, 'max_sentence1_length': 470, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 130.33, 'max_sentence2_length': 473, 'unique_sentence2': 1996}, 'uzb_Latn-kir_Cyrl': {'num_samples': 1997, 'number_of_characters': 544559, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 147.07, 'max_sentence1_length': 470, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 125.62, 'max_sentence2_length': 395, 'unique_sentence2': 1996}, 'uzb_Latn-tat_Cyrl': {'num_samples': 1997, 'number_of_characters': 539621, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 147.07, 'max_sentence1_length': 470, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 123.15, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'uzb_Latn-tuk_Latn': {'num_samples': 1997, 'number_of_characters': 578969, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 147.07, 'max_sentence1_length': 470, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 142.85, 'max_sentence2_length': 576, 'unique_sentence2': 1996}, 'uzb_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 559308, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 147.07, 'max_sentence1_length': 470, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'uzb_Latn-uig_Arab': {'num_samples': 1997, 'number_of_characters': 604717, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 147.07, 'max_sentence1_length': 470, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 155.74, 'max_sentence2_length': 592, 'unique_sentence2': 1996}, 'ven_Latn-bem_Latn': {'num_samples': 1997, 'number_of_characters': 598248, 'unique_pairs': 
1997, 'min_sentence1_length': 10, 'average_sentence1_length': 150.1, 'max_sentence1_length': 535, 'unique_sentence1': 1993, 'min_sentence2_length': 8, 'average_sentence2_length': 149.47, 'max_sentence2_length': 465, 'unique_sentence2': 1997}, 'ven_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 547476, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 150.1, 'max_sentence1_length': 535, 'unique_sentence1': 1993, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ven_Latn-ewe_Latn': {'num_samples': 1997, 'number_of_characters': 538734, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 150.1, 'max_sentence1_length': 535, 'unique_sentence1': 1993, 'min_sentence2_length': 6, 'average_sentence2_length': 119.67, 'max_sentence2_length': 493, 'unique_sentence2': 1994}, 'ven_Latn-fuc_Latn': {'num_samples': 1997, 'number_of_characters': 528236, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 150.1, 'max_sentence1_length': 535, 'unique_sentence1': 1993, 'min_sentence2_length': 6, 'average_sentence2_length': 114.41, 'max_sentence2_length': 376, 'unique_sentence2': 1996}, 'ven_Latn-kin_Latn': {'num_samples': 1997, 'number_of_characters': 603543, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 150.1, 'max_sentence1_length': 535, 'unique_sentence1': 1993, 'min_sentence2_length': 10, 'average_sentence2_length': 152.12, 'max_sentence2_length': 541, 'unique_sentence2': 1996}, 'ven_Latn-nde_Latn': {'num_samples': 1997, 'number_of_characters': 597495, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 150.1, 'max_sentence1_length': 535, 'unique_sentence1': 1993, 'min_sentence2_length': 6, 'average_sentence2_length': 149.09, 'max_sentence2_length': 590, 'unique_sentence2': 1997}, 'ven_Latn-nya_Latn': {'num_samples': 1997, 'number_of_characters': 584038, 
'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 150.1, 'max_sentence1_length': 535, 'unique_sentence1': 1993, 'min_sentence2_length': 10, 'average_sentence2_length': 142.35, 'max_sentence2_length': 464, 'unique_sentence2': 1993}, 'ven_Latn-sna_Latn': {'num_samples': 1997, 'number_of_characters': 598086, 'unique_pairs': 1995, 'min_sentence1_length': 10, 'average_sentence1_length': 150.1, 'max_sentence1_length': 535, 'unique_sentence1': 1993, 'min_sentence2_length': 6, 'average_sentence2_length': 149.39, 'max_sentence2_length': 511, 'unique_sentence2': 1995}, 'vie_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 502302, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'vie_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 514923, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'vie_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 566771, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'vie_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 570248, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'vie_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 
518841, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'vie_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 514460, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'vie_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 540921, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'vie_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 564229, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'vie_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 471417, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'vie_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 532696, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'vie_Latn-hun_Latn': {'num_samples': 1997, 
'number_of_characters': 549497, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'vie_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 558093, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'vie_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 382747, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'vie_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 404526, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'vie_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 530518, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'vie_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 563036, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'vie_Latn-pol_Latn': {'num_samples': 1997, 
'number_of_characters': 548987, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'vie_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 548336, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'vie_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 546014, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'vie_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 559321, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'vie_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 543630, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'vie_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 522948, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'vie_Latn-tam_Taml': {'num_samples': 
1997, 'number_of_characters': 581082, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'vie_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 536734, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'vie_Latn-yue_Hant': {'num_samples': 1997, 'number_of_characters': 350008, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 39.5, 'max_sentence2_length': 133, 'unique_sentence2': 1996}, 'vie_Latn-zho_Hans': {'num_samples': 1997, 'number_of_characters': 356082, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 42.54, 'max_sentence2_length': 263, 'unique_sentence2': 1997}, 'vie_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 362599, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'vie_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 528853, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'wol_Latn-amh_Ethi': 
{'num_samples': 1997, 'number_of_characters': 407310, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'wol_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 487523, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'wol_Latn-hau_Latn': {'num_samples': 1997, 'number_of_characters': 509769, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'wol_Latn-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 485904, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 'unique_sentence2': 1997}, 'wol_Latn-nso_Latn': {'num_samples': 1997, 'number_of_characters': 531302, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 1996}, 'wol_Latn-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 477234, 'unique_pairs': 1992, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 
'wol_Latn-som_Latn': {'num_samples': 1997, 'number_of_characters': 531095, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'wol_Latn-ssw_Latn': {'num_samples': 1997, 'number_of_characters': 527945, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'wol_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 512312, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'wol_Latn-tir_Ethi': {'num_samples': 1997, 'number_of_characters': 405041, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 272, 'unique_sentence2': 1996}, 'wol_Latn-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 574086, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 'unique_sentence2': 1997}, 'wol_Latn-xho_Latn': {'num_samples': 1997, 'number_of_characters': 507893, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 
1997}, 'wol_Latn-yor_Latn': {'num_samples': 1997, 'number_of_characters': 555891, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'wol_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 497535, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'xho_Latn-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 435597, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'xho_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 515810, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'xho_Latn-hau_Latn': {'num_samples': 1997, 'number_of_characters': 538056, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'xho_Latn-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 514191, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 
'unique_sentence2': 1997}, 'xho_Latn-nso_Latn': {'num_samples': 1997, 'number_of_characters': 559589, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 1996}, 'xho_Latn-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 505521, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'xho_Latn-som_Latn': {'num_samples': 1997, 'number_of_characters': 559382, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'xho_Latn-ssw_Latn': {'num_samples': 1997, 'number_of_characters': 556232, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'xho_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 540599, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'xho_Latn-tir_Ethi': {'num_samples': 1997, 'number_of_characters': 433328, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 
272, 'unique_sentence2': 1996}, 'xho_Latn-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 602373, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 'unique_sentence2': 1997}, 'xho_Latn-wol_Latn': {'num_samples': 1997, 'number_of_characters': 507893, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 'xho_Latn-yor_Latn': {'num_samples': 1997, 'number_of_characters': 584178, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'xho_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 525822, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'yor_Latn-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 483595, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'yor_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 563808, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 
'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'yor_Latn-hau_Latn': {'num_samples': 1997, 'number_of_characters': 586054, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'yor_Latn-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 562189, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 'unique_sentence2': 1997}, 'yor_Latn-nso_Latn': {'num_samples': 1997, 'number_of_characters': 607587, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 1996}, 'yor_Latn-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 553519, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'yor_Latn-som_Latn': {'num_samples': 1997, 'number_of_characters': 607380, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'yor_Latn-ssw_Latn': {'num_samples': 1997, 'number_of_characters': 604230, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 
144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'yor_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 588597, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'yor_Latn-tir_Ethi': {'num_samples': 1997, 'number_of_characters': 481326, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 272, 'unique_sentence2': 1996}, 'yor_Latn-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 650371, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 'unique_sentence2': 1997}, 'yor_Latn-wol_Latn': {'num_samples': 1997, 'number_of_characters': 555891, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 'yor_Latn-xho_Latn': {'num_samples': 1997, 'number_of_characters': 584178, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'yor_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 573820, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 
'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'yue_Hant-eng_Latn': {'num_samples': 1997, 'number_of_characters': 326607, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 39.5, 'max_sentence1_length': 133, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'yue_Hant-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 190513, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 39.5, 'max_sentence1_length': 133, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'yue_Hant-kor_Hang': {'num_samples': 1997, 'number_of_characters': 212292, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 39.5, 'max_sentence1_length': 133, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'yue_Hant-vie_Latn': {'num_samples': 1997, 'number_of_characters': 350008, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 39.5, 'max_sentence1_length': 133, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'yue_Hant-zho_Hans': {'num_samples': 1997, 'number_of_characters': 163848, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 39.5, 'max_sentence1_length': 133, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 42.54, 'max_sentence2_length': 263, 'unique_sentence2': 1997}, 'yue_Hant-zho_Hant': {'num_samples': 1997, 'number_of_characters': 170365, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 39.5, 'max_sentence1_length': 133, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 
'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'zho_Hans-eng_Latn': {'num_samples': 1997, 'number_of_characters': 332681, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 42.54, 'max_sentence1_length': 263, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'zho_Hans-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 196587, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 42.54, 'max_sentence1_length': 263, 'unique_sentence1': 1997, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'zho_Hans-kor_Hang': {'num_samples': 1997, 'number_of_characters': 218366, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 42.54, 'max_sentence1_length': 263, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'zho_Hans-vie_Latn': {'num_samples': 1997, 'number_of_characters': 356082, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 42.54, 'max_sentence1_length': 263, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'zho_Hans-yue_Hant': {'num_samples': 1997, 'number_of_characters': 163848, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 42.54, 'max_sentence1_length': 263, 'unique_sentence1': 1997, 'min_sentence2_length': 4, 'average_sentence2_length': 39.5, 'max_sentence2_length': 133, 'unique_sentence2': 1996}, 'zho_Hans-zho_Hant': {'num_samples': 1997, 'number_of_characters': 176439, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 42.54, 'max_sentence1_length': 263, 'unique_sentence1': 1997, 'min_sentence2_length': 3, 
'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'zho_Hant-arb_Arab': {'num_samples': 1997, 'number_of_characters': 322659, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'zho_Hant-ben_Beng': {'num_samples': 1997, 'number_of_characters': 335280, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'zho_Hant-deu_Latn': {'num_samples': 1997, 'number_of_characters': 387128, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'zho_Hant-ell_Grek': {'num_samples': 1997, 'number_of_characters': 390605, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'zho_Hant-eng_Latn': {'num_samples': 1997, 'number_of_characters': 339198, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'zho_Hant-fas_Arab': {'num_samples': 1997, 'number_of_characters': 334817, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 
'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'zho_Hant-fin_Latn': {'num_samples': 1997, 'number_of_characters': 361278, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'zho_Hant-fra_Latn': {'num_samples': 1997, 'number_of_characters': 384586, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'zho_Hant-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 291774, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'zho_Hant-hin_Deva': {'num_samples': 1997, 'number_of_characters': 353053, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'zho_Hant-hun_Latn': {'num_samples': 1997, 'number_of_characters': 369854, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'zho_Hant-ind_Latn': {'num_samples': 1997, 'number_of_characters': 378450, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 
'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'zho_Hant-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 203104, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'zho_Hant-kor_Hang': {'num_samples': 1997, 'number_of_characters': 224883, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'zho_Hant-lit_Latn': {'num_samples': 1997, 'number_of_characters': 350875, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'zho_Hant-nld_Latn': {'num_samples': 1997, 'number_of_characters': 383393, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'zho_Hant-pol_Latn': {'num_samples': 1997, 'number_of_characters': 369344, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'zho_Hant-por_Latn': {'num_samples': 1997, 'number_of_characters': 368693, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 
'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'zho_Hant-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 366371, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'zho_Hant-spa_Latn': {'num_samples': 1997, 'number_of_characters': 379678, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'zho_Hant-swa_Latn': {'num_samples': 1997, 'number_of_characters': 363987, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'zho_Hant-swe_Latn': {'num_samples': 1997, 'number_of_characters': 343305, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'zho_Hant-tam_Taml': {'num_samples': 1997, 'number_of_characters': 401439, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'zho_Hant-tur_Latn': {'num_samples': 1997, 'number_of_characters': 357091, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 
'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'zho_Hant-vie_Latn': {'num_samples': 1997, 'number_of_characters': 362599, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'zho_Hant-yue_Hant': {'num_samples': 1997, 'number_of_characters': 170365, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 39.5, 'max_sentence2_length': 133, 'unique_sentence2': 1996}, 'zho_Hant-zho_Hans': {'num_samples': 1997, 'number_of_characters': 176439, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 42.54, 'max_sentence2_length': 263, 'unique_sentence2': 1997}, 'zho_Hant-zul_Latn': {'num_samples': 1997, 'number_of_characters': 349210, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'zul_Latn-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 425239, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'zul_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 488913, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 
'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'zul_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 501534, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'zul_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 553382, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'zul_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 556859, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'zul_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 505452, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'zul_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 501071, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'zul_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 527532, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 
'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'zul_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 550840, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'zul_Latn-hau_Latn': {'num_samples': 1997, 'number_of_characters': 527698, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'zul_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 458028, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'zul_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 519307, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'zul_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 536108, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'zul_Latn-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 503833, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 
1996, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 'unique_sentence2': 1997}, 'zul_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 544704, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'zul_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 369358, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'zul_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 391137, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'zul_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 517129, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'zul_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 549647, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'zul_Latn-nso_Latn': {'num_samples': 1997, 'number_of_characters': 549231, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 
1996, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 1996}, 'zul_Latn-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 495163, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'zul_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 535598, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'zul_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 534947, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'zul_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 532625, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'zul_Latn-som_Latn': {'num_samples': 1997, 'number_of_characters': 549024, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'zul_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 545932, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 
'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'zul_Latn-ssw_Latn': {'num_samples': 1997, 'number_of_characters': 545874, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'zul_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 530241, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'zul_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 509559, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'zul_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 567693, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'zul_Latn-tir_Ethi': {'num_samples': 1997, 'number_of_characters': 422970, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 272, 'unique_sentence2': 1996}, 'zul_Latn-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 592015, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 
494, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 'unique_sentence2': 1997}, 'zul_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 523345, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'zul_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 528853, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'zul_Latn-wol_Latn': {'num_samples': 1997, 'number_of_characters': 497535, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 'zul_Latn-xho_Latn': {'num_samples': 1997, 'number_of_characters': 525822, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'zul_Latn-yor_Latn': {'num_samples': 1997, 'number_of_characters': 573820, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'zul_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 349210, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 
'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}}}} | +| [NSynth](https://huggingface.co/datasets/anime-sh/NSYNTH_PITCH_HEAR) (Jesse Engel, 2017) | ['eng'] | AudioClassification | a2t | [Music] | None | None | +| [NTREXBitextMining](https://huggingface.co/datasets/davidstap/NTREX) (Federmann et al., 2022) | ['afr', 'amh', 'arb', 'aze', 'bak', 'bel', 'bem', 'ben', 'bod', 'bos', 'bul', 'cat', 'ces', 'ckb', 'cym', 'dan', 'deu', 'div', 'dzo', 'ell', 'eng', 'eus', 'ewe', 'fao', 'fas', 'fij', 'fil', 'fin', 'fra', 'fuc', 'gle', 'glg', 'guj', 'hau', 'heb', 'hin', 'hmn', 'hrv', 'hun', 'hye', 'ibo', 'ind', 'isl', 'ita', 'jpn', 'kan', 'kat', 'kaz', 'khm', 'kin', 'kir', 'kmr', 'kor', 'lao', 'lav', 'lit', 'ltz', 'mal', 'mar', 'mey', 'mkd', 'mlg', 'mlt', 'mon', 'mri', 'msa', 'mya', 'nde', 'nep', 'nld', 'nno', 'nob', 'nso', 'nya', 'orm', 'pan', 'pol', 'por', 'prs', 'pus', 'ron', 'rus', 'shi', 'sin', 'slk', 'slv', 'smo', 'sna', 'snd', 'som', 'spa', 'sqi', 'srp', 'ssw', 'swa', 'swe', 'tah', 'tam', 'tat', 'tel', 'tgk', 'tha', 'tir', 'ton', 'tsn', 'tuk', 'tur', 'uig', 'ukr', 'urd', 'uzb', 'ven', 'vie', 'wol', 'xho', 'yor', 'yue', 'zho', 'zul'] | BitextMining | s2s | [News, Written] | {'test': 3826252} | {'test': {'num_samples': 3826252, 'number_of_characters': 988355274, 'unique_pairs': 3820263, 'min_sentence1_length': 1, 'average_sentence1_length': 129.15, 'max_sentence1_length': 773, 'unique_sentence1': 241259, 'min_sentence2_length': 1, 'average_sentence2_length': 129.15, 'max_sentence2_length': 773, 'unique_sentence2': 241259, 'hf_subset_descriptive_stats': {'afr_Latn-dan_Latn': {'num_samples': 1997, 'number_of_characters': 520490, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 134.38, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 126.26, 'max_sentence2_length': 522, 
'unique_sentence2': 1995}, 'afr_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 564002, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 134.38, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'afr_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 516072, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 134.38, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'afr_Latn-fao_Latn': {'num_samples': 1997, 'number_of_characters': 526155, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 134.38, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.1, 'max_sentence2_length': 433, 'unique_sentence2': 1997}, 'afr_Latn-isl_Latn': {'num_samples': 1997, 'number_of_characters': 530560, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 134.38, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 131.3, 'max_sentence2_length': 399, 'unique_sentence2': 1996}, 'afr_Latn-ltz_Latn': {'num_samples': 1997, 'number_of_characters': 549109, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 134.38, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 140.59, 'max_sentence2_length': 543, 'unique_sentence2': 1996}, 'afr_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 560267, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 134.38, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 
539, 'unique_sentence2': 1996}, 'afr_Latn-nno_Latn': {'num_samples': 1997, 'number_of_characters': 516709, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 134.38, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.37, 'max_sentence2_length': 417, 'unique_sentence2': 1996}, 'afr_Latn-nob_Latn': {'num_samples': 1997, 'number_of_characters': 519796, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 134.38, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 125.91, 'max_sentence2_length': 482, 'unique_sentence2': 1996}, 'afr_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 520179, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 134.38, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'amh_Ethi-eng_Latn': {'num_samples': 1997, 'number_of_characters': 415227, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'amh_Ethi-hau_Latn': {'num_samples': 1997, 'number_of_characters': 437473, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'amh_Ethi-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 413608, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 
469, 'unique_sentence2': 1997}, 'amh_Ethi-nso_Latn': {'num_samples': 1997, 'number_of_characters': 459006, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 1996}, 'amh_Ethi-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 404938, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'amh_Ethi-som_Latn': {'num_samples': 1997, 'number_of_characters': 458799, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'amh_Ethi-ssw_Latn': {'num_samples': 1997, 'number_of_characters': 455649, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'amh_Ethi-swa_Latn': {'num_samples': 1997, 'number_of_characters': 440016, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'amh_Ethi-tir_Ethi': {'num_samples': 1997, 'number_of_characters': 332745, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 
272, 'unique_sentence2': 1996}, 'amh_Ethi-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 501790, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 'unique_sentence2': 1997}, 'amh_Ethi-wol_Latn': {'num_samples': 1997, 'number_of_characters': 407310, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 'amh_Ethi-xho_Latn': {'num_samples': 1997, 'number_of_characters': 435597, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'amh_Ethi-yor_Latn': {'num_samples': 1997, 'number_of_characters': 483595, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'amh_Ethi-zul_Latn': {'num_samples': 1997, 'number_of_characters': 425239, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 83.88, 'max_sentence1_length': 290, 'unique_sentence1': 1994, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'arb_Arab-ben_Beng': {'num_samples': 1997, 'number_of_characters': 474983, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 
402, 'unique_sentence2': 1997}, 'arb_Arab-ckb_Arab': {'num_samples': 1997, 'number_of_characters': 483548, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 126.37, 'max_sentence2_length': 399, 'unique_sentence2': 1995}, 'arb_Arab-deu_Latn': {'num_samples': 1997, 'number_of_characters': 526831, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'arb_Arab-ell_Grek': {'num_samples': 1997, 'number_of_characters': 530308, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'arb_Arab-eng_Latn': {'num_samples': 1997, 'number_of_characters': 478901, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'arb_Arab-fas_Arab': {'num_samples': 1997, 'number_of_characters': 474520, 'unique_pairs': 1995, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'arb_Arab-fin_Latn': {'num_samples': 1997, 'number_of_characters': 500981, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 
'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'arb_Arab-fra_Latn': {'num_samples': 1997, 'number_of_characters': 524289, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'arb_Arab-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 431477, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'arb_Arab-hin_Deva': {'num_samples': 1997, 'number_of_characters': 492756, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'arb_Arab-hun_Latn': {'num_samples': 1997, 'number_of_characters': 509557, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'arb_Arab-ind_Latn': {'num_samples': 1997, 'number_of_characters': 518153, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'arb_Arab-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 342807, 'unique_pairs': 1995, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 
'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'arb_Arab-kmr_Latn': {'num_samples': 1997, 'number_of_characters': 477127, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 123.16, 'max_sentence2_length': 420, 'unique_sentence2': 1996}, 'arb_Arab-kor_Hang': {'num_samples': 1997, 'number_of_characters': 364586, 'unique_pairs': 1995, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'arb_Arab-lit_Latn': {'num_samples': 1997, 'number_of_characters': 490578, 'unique_pairs': 1995, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'arb_Arab-mey_Arab': {'num_samples': 1997, 'number_of_characters': 445016, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 107.08, 'max_sentence2_length': 392, 'unique_sentence2': 1993}, 'arb_Arab-nld_Latn': {'num_samples': 1997, 'number_of_characters': 523096, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'arb_Arab-pol_Latn': {'num_samples': 1997, 'number_of_characters': 509047, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 
139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'arb_Arab-por_Latn': {'num_samples': 1997, 'number_of_characters': 508396, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'arb_Arab-prs_Arab': {'num_samples': 1997, 'number_of_characters': 473717, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 121.45, 'max_sentence2_length': 365, 'unique_sentence2': 1997}, 'arb_Arab-pus_Arab': {'num_samples': 1997, 'number_of_characters': 473814, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 121.5, 'max_sentence2_length': 366, 'unique_sentence2': 1996}, 'arb_Arab-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 506074, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'arb_Arab-shi_Arab': {'num_samples': 1997, 'number_of_characters': 446094, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 3, 'average_sentence2_length': 107.62, 'max_sentence2_length': 378, 'unique_sentence2': 1996}, 'arb_Arab-spa_Latn': {'num_samples': 1997, 'number_of_characters': 519381, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 1, 
'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'arb_Arab-swa_Latn': {'num_samples': 1997, 'number_of_characters': 503690, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'arb_Arab-swe_Latn': {'num_samples': 1997, 'number_of_characters': 483008, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'arb_Arab-tam_Taml': {'num_samples': 1997, 'number_of_characters': 541142, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'arb_Arab-tgk_Cyrl': {'num_samples': 1997, 'number_of_characters': 505328, 'unique_pairs': 1995, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 3, 'average_sentence2_length': 137.28, 'max_sentence2_length': 451, 'unique_sentence2': 1995}, 'arb_Arab-tur_Latn': {'num_samples': 1997, 'number_of_characters': 496794, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'arb_Arab-vie_Latn': {'num_samples': 1997, 'number_of_characters': 502302, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 
'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'arb_Arab-zho_Hant': {'num_samples': 1997, 'number_of_characters': 322659, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'arb_Arab-zul_Latn': {'num_samples': 1997, 'number_of_characters': 488913, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 115.76, 'max_sentence1_length': 362, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'aze_Latn-bak_Cyrl': {'num_samples': 1997, 'number_of_characters': 515960, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 135.02, 'max_sentence1_length': 398, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 123.35, 'max_sentence2_length': 437, 'unique_sentence2': 1995}, 'aze_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 517354, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 135.02, 'max_sentence1_length': 398, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'aze_Latn-kaz_Cyrl': {'num_samples': 1997, 'number_of_characters': 529910, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 135.02, 'max_sentence1_length': 398, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 130.33, 'max_sentence2_length': 473, 'unique_sentence2': 1996}, 'aze_Latn-kir_Cyrl': {'num_samples': 1997, 'number_of_characters': 520498, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 135.02, 'max_sentence1_length': 398, 'unique_sentence1': 
1997, 'min_sentence2_length': 8, 'average_sentence2_length': 125.62, 'max_sentence2_length': 395, 'unique_sentence2': 1996}, 'aze_Latn-tat_Cyrl': {'num_samples': 1997, 'number_of_characters': 515560, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 135.02, 'max_sentence1_length': 398, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 123.15, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'aze_Latn-tuk_Latn': {'num_samples': 1997, 'number_of_characters': 554908, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 135.02, 'max_sentence1_length': 398, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 142.85, 'max_sentence2_length': 576, 'unique_sentence2': 1996}, 'aze_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 535247, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 135.02, 'max_sentence1_length': 398, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'aze_Latn-uig_Arab': {'num_samples': 1997, 'number_of_characters': 580656, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 135.02, 'max_sentence1_length': 398, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 155.74, 'max_sentence2_length': 592, 'unique_sentence2': 1996}, 'aze_Latn-uzb_Latn': {'num_samples': 1997, 'number_of_characters': 563329, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 135.02, 'max_sentence1_length': 398, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 147.07, 'max_sentence2_length': 470, 'unique_sentence2': 1996}, 'bak_Cyrl-aze_Latn': {'num_samples': 1997, 'number_of_characters': 515960, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 123.35, 'max_sentence1_length': 437, 
'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 135.02, 'max_sentence2_length': 398, 'unique_sentence2': 1997}, 'bak_Cyrl-eng_Latn': {'num_samples': 1997, 'number_of_characters': 494046, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 123.35, 'max_sentence1_length': 437, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'bak_Cyrl-kaz_Cyrl': {'num_samples': 1997, 'number_of_characters': 506602, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 123.35, 'max_sentence1_length': 437, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 130.33, 'max_sentence2_length': 473, 'unique_sentence2': 1996}, 'bak_Cyrl-kir_Cyrl': {'num_samples': 1997, 'number_of_characters': 497190, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 123.35, 'max_sentence1_length': 437, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 125.62, 'max_sentence2_length': 395, 'unique_sentence2': 1996}, 'bak_Cyrl-tat_Cyrl': {'num_samples': 1997, 'number_of_characters': 492252, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 123.35, 'max_sentence1_length': 437, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 123.15, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'bak_Cyrl-tuk_Latn': {'num_samples': 1997, 'number_of_characters': 531600, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 123.35, 'max_sentence1_length': 437, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 142.85, 'max_sentence2_length': 576, 'unique_sentence2': 1996}, 'bak_Cyrl-tur_Latn': {'num_samples': 1997, 'number_of_characters': 511939, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 123.35, 
'max_sentence1_length': 437, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'bak_Cyrl-uig_Arab': {'num_samples': 1997, 'number_of_characters': 557348, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 123.35, 'max_sentence1_length': 437, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 155.74, 'max_sentence2_length': 592, 'unique_sentence2': 1996}, 'bak_Cyrl-uzb_Latn': {'num_samples': 1997, 'number_of_characters': 540021, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 123.35, 'max_sentence1_length': 437, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 147.07, 'max_sentence2_length': 470, 'unique_sentence2': 1996}, 'bel_Cyrl-bos_Latn': {'num_samples': 1997, 'number_of_characters': 511000, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 128.24, 'max_sentence1_length': 422, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 127.65, 'max_sentence2_length': 434, 'unique_sentence2': 1996}, 'bel_Cyrl-bul_Cyrl': {'num_samples': 1997, 'number_of_characters': 525979, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 128.24, 'max_sentence1_length': 422, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 135.15, 'max_sentence2_length': 493, 'unique_sentence2': 1996}, 'bel_Cyrl-ces_Latn': {'num_samples': 1997, 'number_of_characters': 497408, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.24, 'max_sentence1_length': 422, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 120.84, 'max_sentence2_length': 474, 'unique_sentence2': 1997}, 'bel_Cyrl-eng_Latn': {'num_samples': 1997, 'number_of_characters': 503810, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 
128.24, 'max_sentence1_length': 422, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'bel_Cyrl-hrv_Latn': {'num_samples': 1997, 'number_of_characters': 512015, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.24, 'max_sentence1_length': 422, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.15, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'bel_Cyrl-mkd_Cyrl': {'num_samples': 1997, 'number_of_characters': 523981, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.24, 'max_sentence1_length': 422, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.15, 'max_sentence2_length': 451, 'unique_sentence2': 1997}, 'bel_Cyrl-pol_Latn': {'num_samples': 1997, 'number_of_characters': 533956, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 128.24, 'max_sentence1_length': 422, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'bel_Cyrl-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 530983, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 128.24, 'max_sentence1_length': 422, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'bel_Cyrl-slk_Latn': {'num_samples': 1997, 'number_of_characters': 509059, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 128.24, 'max_sentence1_length': 422, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 126.67, 'max_sentence2_length': 403, 'unique_sentence2': 1996}, 'bel_Cyrl-slv_Latn': {'num_samples': 1997, 'number_of_characters': 508986, 'unique_pairs': 1996, 'min_sentence1_length': 7, 
'average_sentence1_length': 128.24, 'max_sentence1_length': 422, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.64, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'bel_Cyrl-srp_Cyrl': {'num_samples': 1997, 'number_of_characters': 508393, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 128.24, 'max_sentence1_length': 422, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.34, 'max_sentence2_length': 439, 'unique_sentence2': 1995}, 'bel_Cyrl-srp_Latn': {'num_samples': 1997, 'number_of_characters': 512231, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 128.24, 'max_sentence1_length': 422, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 128.26, 'max_sentence2_length': 452, 'unique_sentence2': 1996}, 'bel_Cyrl-ukr_Cyrl': {'num_samples': 1997, 'number_of_characters': 518873, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 128.24, 'max_sentence1_length': 422, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 131.59, 'max_sentence2_length': 440, 'unique_sentence2': 1996}, 'bem_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 546212, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 149.47, 'max_sentence1_length': 465, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'bem_Latn-ewe_Latn': {'num_samples': 1997, 'number_of_characters': 537470, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 149.47, 'max_sentence1_length': 465, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 119.67, 'max_sentence2_length': 493, 'unique_sentence2': 1994}, 'bem_Latn-fuc_Latn': {'num_samples': 1997, 'number_of_characters': 526972, 'unique_pairs': 1997, 
'min_sentence1_length': 8, 'average_sentence1_length': 149.47, 'max_sentence1_length': 465, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 114.41, 'max_sentence2_length': 376, 'unique_sentence2': 1996}, 'bem_Latn-kin_Latn': {'num_samples': 1997, 'number_of_characters': 602279, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 149.47, 'max_sentence1_length': 465, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 152.12, 'max_sentence2_length': 541, 'unique_sentence2': 1996}, 'bem_Latn-nde_Latn': {'num_samples': 1997, 'number_of_characters': 596231, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 149.47, 'max_sentence1_length': 465, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 149.09, 'max_sentence2_length': 590, 'unique_sentence2': 1997}, 'bem_Latn-nya_Latn': {'num_samples': 1997, 'number_of_characters': 582774, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 149.47, 'max_sentence1_length': 465, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 142.35, 'max_sentence2_length': 464, 'unique_sentence2': 1993}, 'bem_Latn-sna_Latn': {'num_samples': 1997, 'number_of_characters': 596822, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 149.47, 'max_sentence1_length': 465, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 149.39, 'max_sentence2_length': 511, 'unique_sentence2': 1995}, 'bem_Latn-ven_Latn': {'num_samples': 1997, 'number_of_characters': 598248, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 149.47, 'max_sentence1_length': 465, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 150.1, 'max_sentence2_length': 535, 'unique_sentence2': 1993}, 'ben_Beng-arb_Arab': {'num_samples': 1997, 'number_of_characters': 474983, 'unique_pairs': 
1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'ben_Beng-deu_Latn': {'num_samples': 1997, 'number_of_characters': 539452, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'ben_Beng-div_Thaa': {'num_samples': 1997, 'number_of_characters': 547650, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'ben_Beng-ell_Grek': {'num_samples': 1997, 'number_of_characters': 542929, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'ben_Beng-eng_Latn': {'num_samples': 1997, 'number_of_characters': 491522, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ben_Beng-eus_Latn': {'num_samples': 1997, 'number_of_characters': 519005, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'ben_Beng-fas_Arab': {'num_samples': 1997, 'number_of_characters': 487141, 
'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'ben_Beng-fin_Latn': {'num_samples': 1997, 'number_of_characters': 513602, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'ben_Beng-fra_Latn': {'num_samples': 1997, 'number_of_characters': 536910, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'ben_Beng-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 488733, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'ben_Beng-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 444098, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'ben_Beng-hin_Deva': {'num_samples': 1997, 'number_of_characters': 505377, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'ben_Beng-hun_Latn': {'num_samples': 1997, 'number_of_characters': 
522178, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'ben_Beng-ind_Latn': {'num_samples': 1997, 'number_of_characters': 530774, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'ben_Beng-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 355428, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'ben_Beng-kan_Knda': {'num_samples': 1997, 'number_of_characters': 509338, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'ben_Beng-kor_Hang': {'num_samples': 1997, 'number_of_characters': 377207, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'ben_Beng-lit_Latn': {'num_samples': 1997, 'number_of_characters': 503199, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'ben_Beng-mar_Deva': {'num_samples': 1997, 'number_of_characters': 
504689, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'ben_Beng-nep_Deva': {'num_samples': 1997, 'number_of_characters': 492025, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'ben_Beng-nld_Latn': {'num_samples': 1997, 'number_of_characters': 535717, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'ben_Beng-pan_Guru': {'num_samples': 1997, 'number_of_characters': 494224, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'ben_Beng-pol_Latn': {'num_samples': 1997, 'number_of_characters': 521668, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'ben_Beng-por_Latn': {'num_samples': 1997, 'number_of_characters': 521017, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'ben_Beng-rus_Cyrl': {'num_samples': 1997, 
'number_of_characters': 518695, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'ben_Beng-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 502543, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'ben_Beng-snd_Arab': {'num_samples': 1997, 'number_of_characters': 464129, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'ben_Beng-spa_Latn': {'num_samples': 1997, 'number_of_characters': 532002, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'ben_Beng-swa_Latn': {'num_samples': 1997, 'number_of_characters': 516311, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'ben_Beng-swe_Latn': {'num_samples': 1997, 'number_of_characters': 495629, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'ben_Beng-tam_Taml': {'num_samples': 
1997, 'number_of_characters': 553763, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'ben_Beng-tel_Telu': {'num_samples': 1997, 'number_of_characters': 491329, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'ben_Beng-tur_Latn': {'num_samples': 1997, 'number_of_characters': 509415, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'ben_Beng-urd_Arab': {'num_samples': 1997, 'number_of_characters': 491800, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'ben_Beng-vie_Latn': {'num_samples': 1997, 'number_of_characters': 514923, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'ben_Beng-zho_Hant': {'num_samples': 1997, 'number_of_characters': 335280, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'ben_Beng-zul_Latn': 
{'num_samples': 1997, 'number_of_characters': 501534, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 122.08, 'max_sentence1_length': 402, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'bod_Tibt-dzo_Tibt': {'num_samples': 1997, 'number_of_characters': 543850, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 150.54, 'max_sentence1_length': 478, 'unique_sentence1': 1993, 'min_sentence2_length': 5, 'average_sentence2_length': 121.79, 'max_sentence2_length': 411, 'unique_sentence2': 1992}, 'bod_Tibt-eng_Latn': {'num_samples': 1997, 'number_of_characters': 548349, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 150.54, 'max_sentence1_length': 478, 'unique_sentence1': 1993, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'bod_Tibt-khm_Khmr': {'num_samples': 1997, 'number_of_characters': 589120, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 150.54, 'max_sentence1_length': 478, 'unique_sentence1': 1993, 'min_sentence2_length': 6, 'average_sentence2_length': 144.46, 'max_sentence2_length': 517, 'unique_sentence2': 1996}, 'bod_Tibt-lao_Laoo': {'num_samples': 1997, 'number_of_characters': 567609, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 150.54, 'max_sentence1_length': 478, 'unique_sentence1': 1993, 'min_sentence2_length': 7, 'average_sentence2_length': 133.69, 'max_sentence2_length': 507, 'unique_sentence2': 1997}, 'bod_Tibt-mon_Mong': {'num_samples': 1997, 'number_of_characters': 559677, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 150.54, 'max_sentence1_length': 478, 'unique_sentence1': 1993, 'min_sentence2_length': 11, 'average_sentence2_length': 129.72, 'max_sentence2_length': 414, 'unique_sentence2': 1997}, 
'bod_Tibt-mya_Mymr': {'num_samples': 1997, 'number_of_characters': 612483, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 150.54, 'max_sentence1_length': 478, 'unique_sentence1': 1993, 'min_sentence2_length': 9, 'average_sentence2_length': 156.16, 'max_sentence2_length': 773, 'unique_sentence2': 1997}, 'bod_Tibt-tha_Thai': {'num_samples': 1997, 'number_of_characters': 538097, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 150.54, 'max_sentence1_length': 478, 'unique_sentence1': 1993, 'min_sentence2_length': 7, 'average_sentence2_length': 118.91, 'max_sentence2_length': 439, 'unique_sentence2': 1996}, 'bos_Latn-bel_Cyrl': {'num_samples': 1997, 'number_of_characters': 511000, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 127.65, 'max_sentence1_length': 434, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.24, 'max_sentence2_length': 422, 'unique_sentence2': 1996}, 'bos_Latn-bul_Cyrl': {'num_samples': 1997, 'number_of_characters': 524799, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 127.65, 'max_sentence1_length': 434, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 135.15, 'max_sentence2_length': 493, 'unique_sentence2': 1996}, 'bos_Latn-ces_Latn': {'num_samples': 1997, 'number_of_characters': 496228, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 127.65, 'max_sentence1_length': 434, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 120.84, 'max_sentence2_length': 474, 'unique_sentence2': 1997}, 'bos_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 502630, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 127.65, 'max_sentence1_length': 434, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 
1997}, 'bos_Latn-hrv_Latn': {'num_samples': 1997, 'number_of_characters': 510835, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 127.65, 'max_sentence1_length': 434, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.15, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'bos_Latn-mkd_Cyrl': {'num_samples': 1997, 'number_of_characters': 522801, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 127.65, 'max_sentence1_length': 434, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.15, 'max_sentence2_length': 451, 'unique_sentence2': 1997}, 'bos_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 532776, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 127.65, 'max_sentence1_length': 434, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'bos_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 529803, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 127.65, 'max_sentence1_length': 434, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'bos_Latn-slk_Latn': {'num_samples': 1997, 'number_of_characters': 507879, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 127.65, 'max_sentence1_length': 434, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 126.67, 'max_sentence2_length': 403, 'unique_sentence2': 1996}, 'bos_Latn-slv_Latn': {'num_samples': 1997, 'number_of_characters': 507806, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 127.65, 'max_sentence1_length': 434, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.64, 'max_sentence2_length': 463, 
'unique_sentence2': 1996}, 'bos_Latn-srp_Cyrl': {'num_samples': 1997, 'number_of_characters': 507213, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 127.65, 'max_sentence1_length': 434, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.34, 'max_sentence2_length': 439, 'unique_sentence2': 1995}, 'bos_Latn-srp_Latn': {'num_samples': 1997, 'number_of_characters': 511051, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 127.65, 'max_sentence1_length': 434, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 128.26, 'max_sentence2_length': 452, 'unique_sentence2': 1996}, 'bos_Latn-ukr_Cyrl': {'num_samples': 1997, 'number_of_characters': 517693, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 127.65, 'max_sentence1_length': 434, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 131.59, 'max_sentence2_length': 440, 'unique_sentence2': 1996}, 'bul_Cyrl-bel_Cyrl': {'num_samples': 1997, 'number_of_characters': 525979, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 135.15, 'max_sentence1_length': 493, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.24, 'max_sentence2_length': 422, 'unique_sentence2': 1996}, 'bul_Cyrl-bos_Latn': {'num_samples': 1997, 'number_of_characters': 524799, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 135.15, 'max_sentence1_length': 493, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 127.65, 'max_sentence2_length': 434, 'unique_sentence2': 1996}, 'bul_Cyrl-ces_Latn': {'num_samples': 1997, 'number_of_characters': 511207, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 135.15, 'max_sentence1_length': 493, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 120.84, 'max_sentence2_length': 
474, 'unique_sentence2': 1997}, 'bul_Cyrl-eng_Latn': {'num_samples': 1997, 'number_of_characters': 517609, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 135.15, 'max_sentence1_length': 493, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'bul_Cyrl-hrv_Latn': {'num_samples': 1997, 'number_of_characters': 525814, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 135.15, 'max_sentence1_length': 493, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.15, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'bul_Cyrl-mkd_Cyrl': {'num_samples': 1997, 'number_of_characters': 537780, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 135.15, 'max_sentence1_length': 493, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.15, 'max_sentence2_length': 451, 'unique_sentence2': 1997}, 'bul_Cyrl-pol_Latn': {'num_samples': 1997, 'number_of_characters': 547755, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 135.15, 'max_sentence1_length': 493, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'bul_Cyrl-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 544782, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 135.15, 'max_sentence1_length': 493, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'bul_Cyrl-slk_Latn': {'num_samples': 1997, 'number_of_characters': 522858, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 135.15, 'max_sentence1_length': 493, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 126.67, 
'max_sentence2_length': 403, 'unique_sentence2': 1996}, 'bul_Cyrl-slv_Latn': {'num_samples': 1997, 'number_of_characters': 522785, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 135.15, 'max_sentence1_length': 493, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.64, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'bul_Cyrl-srp_Cyrl': {'num_samples': 1997, 'number_of_characters': 522192, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 135.15, 'max_sentence1_length': 493, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.34, 'max_sentence2_length': 439, 'unique_sentence2': 1995}, 'bul_Cyrl-srp_Latn': {'num_samples': 1997, 'number_of_characters': 526030, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 135.15, 'max_sentence1_length': 493, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 128.26, 'max_sentence2_length': 452, 'unique_sentence2': 1996}, 'bul_Cyrl-ukr_Cyrl': {'num_samples': 1997, 'number_of_characters': 532672, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 135.15, 'max_sentence1_length': 493, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 131.59, 'max_sentence2_length': 440, 'unique_sentence2': 1996}, 'cat_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 530680, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 141.69, 'max_sentence1_length': 460, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'cat_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 576068, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 141.69, 'max_sentence1_length': 460, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 
146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'cat_Latn-glg_Latn': {'num_samples': 1997, 'number_of_characters': 554946, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 141.69, 'max_sentence1_length': 460, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 136.2, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'cat_Latn-ita_Latn': {'num_samples': 1997, 'number_of_characters': 572177, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 141.69, 'max_sentence1_length': 460, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 144.83, 'max_sentence2_length': 623, 'unique_sentence2': 1996}, 'cat_Latn-mlt_Latn': {'num_samples': 1997, 'number_of_characters': 560435, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 141.69, 'max_sentence1_length': 460, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 138.95, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'cat_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 560175, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 141.69, 'max_sentence1_length': 460, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'cat_Latn-ron_Latn': {'num_samples': 1997, 'number_of_characters': 575445, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 141.69, 'max_sentence1_length': 460, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 146.46, 'max_sentence2_length': 518, 'unique_sentence2': 1997}, 'cat_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 571160, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 141.69, 'max_sentence1_length': 460, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 
'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'ces_Latn-bel_Cyrl': {'num_samples': 1997, 'number_of_characters': 497408, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 120.84, 'max_sentence1_length': 474, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 128.24, 'max_sentence2_length': 422, 'unique_sentence2': 1996}, 'ces_Latn-bos_Latn': {'num_samples': 1997, 'number_of_characters': 496228, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 120.84, 'max_sentence1_length': 474, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 127.65, 'max_sentence2_length': 434, 'unique_sentence2': 1996}, 'ces_Latn-bul_Cyrl': {'num_samples': 1997, 'number_of_characters': 511207, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 120.84, 'max_sentence1_length': 474, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 135.15, 'max_sentence2_length': 493, 'unique_sentence2': 1996}, 'ces_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 489038, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 120.84, 'max_sentence1_length': 474, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ces_Latn-hrv_Latn': {'num_samples': 1997, 'number_of_characters': 497243, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 120.84, 'max_sentence1_length': 474, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 128.15, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'ces_Latn-mkd_Cyrl': {'num_samples': 1997, 'number_of_characters': 509209, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 120.84, 'max_sentence1_length': 474, 'unique_sentence1': 1997, 
'min_sentence2_length': 6, 'average_sentence2_length': 134.15, 'max_sentence2_length': 451, 'unique_sentence2': 1997}, 'ces_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 519184, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 120.84, 'max_sentence1_length': 474, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'ces_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 516211, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 120.84, 'max_sentence1_length': 474, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'ces_Latn-slk_Latn': {'num_samples': 1997, 'number_of_characters': 494287, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 120.84, 'max_sentence1_length': 474, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 126.67, 'max_sentence2_length': 403, 'unique_sentence2': 1996}, 'ces_Latn-slv_Latn': {'num_samples': 1997, 'number_of_characters': 494214, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 120.84, 'max_sentence1_length': 474, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 126.64, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'ces_Latn-srp_Cyrl': {'num_samples': 1997, 'number_of_characters': 493621, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 120.84, 'max_sentence1_length': 474, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 126.34, 'max_sentence2_length': 439, 'unique_sentence2': 1995}, 'ces_Latn-srp_Latn': {'num_samples': 1997, 'number_of_characters': 497459, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 120.84, 'max_sentence1_length': 474, 'unique_sentence1': 
1997, 'min_sentence2_length': 6, 'average_sentence2_length': 128.26, 'max_sentence2_length': 452, 'unique_sentence2': 1996}, 'ces_Latn-ukr_Cyrl': {'num_samples': 1997, 'number_of_characters': 504101, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 120.84, 'max_sentence1_length': 474, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 131.59, 'max_sentence2_length': 440, 'unique_sentence2': 1996}, 'ckb_Arab-arb_Arab': {'num_samples': 1997, 'number_of_characters': 483548, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 126.37, 'max_sentence1_length': 399, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'ckb_Arab-eng_Latn': {'num_samples': 1997, 'number_of_characters': 500087, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 126.37, 'max_sentence1_length': 399, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ckb_Arab-fas_Arab': {'num_samples': 1997, 'number_of_characters': 495706, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 126.37, 'max_sentence1_length': 399, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'ckb_Arab-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 452663, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 126.37, 'max_sentence1_length': 399, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'ckb_Arab-kmr_Latn': {'num_samples': 1997, 'number_of_characters': 498313, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 126.37, 'max_sentence1_length': 399, 
'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 123.16, 'max_sentence2_length': 420, 'unique_sentence2': 1996}, 'ckb_Arab-mey_Arab': {'num_samples': 1997, 'number_of_characters': 466202, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 126.37, 'max_sentence1_length': 399, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 107.08, 'max_sentence2_length': 392, 'unique_sentence2': 1993}, 'ckb_Arab-prs_Arab': {'num_samples': 1997, 'number_of_characters': 494903, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 126.37, 'max_sentence1_length': 399, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 121.45, 'max_sentence2_length': 365, 'unique_sentence2': 1997}, 'ckb_Arab-pus_Arab': {'num_samples': 1997, 'number_of_characters': 495000, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 126.37, 'max_sentence1_length': 399, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 121.5, 'max_sentence2_length': 366, 'unique_sentence2': 1996}, 'ckb_Arab-shi_Arab': {'num_samples': 1997, 'number_of_characters': 467280, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 126.37, 'max_sentence1_length': 399, 'unique_sentence1': 1995, 'min_sentence2_length': 3, 'average_sentence2_length': 107.62, 'max_sentence2_length': 378, 'unique_sentence2': 1996}, 'ckb_Arab-tgk_Cyrl': {'num_samples': 1997, 'number_of_characters': 526514, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 126.37, 'max_sentence1_length': 399, 'unique_sentence1': 1995, 'min_sentence2_length': 3, 'average_sentence2_length': 137.28, 'max_sentence2_length': 451, 'unique_sentence2': 1995}, 'cym_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 514225, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.45, 'max_sentence1_length': 
444, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'cym_Latn-gle_Latn': {'num_samples': 1997, 'number_of_characters': 561314, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.45, 'max_sentence1_length': 444, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 147.63, 'max_sentence2_length': 461, 'unique_sentence2': 1997}, 'dan_Latn-afr_Latn': {'num_samples': 1997, 'number_of_characters': 520490, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 126.26, 'max_sentence1_length': 522, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 134.38, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'dan_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 547788, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 126.26, 'max_sentence1_length': 522, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'dan_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 499858, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 126.26, 'max_sentence1_length': 522, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'dan_Latn-fao_Latn': {'num_samples': 1997, 'number_of_characters': 509941, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 126.26, 'max_sentence1_length': 522, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 129.1, 'max_sentence2_length': 433, 'unique_sentence2': 1997}, 'dan_Latn-isl_Latn': {'num_samples': 1997, 'number_of_characters': 514346, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 126.26, 
'max_sentence1_length': 522, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 131.3, 'max_sentence2_length': 399, 'unique_sentence2': 1996}, 'dan_Latn-ltz_Latn': {'num_samples': 1997, 'number_of_characters': 532895, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 126.26, 'max_sentence1_length': 522, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 140.59, 'max_sentence2_length': 543, 'unique_sentence2': 1996}, 'dan_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 544053, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 126.26, 'max_sentence1_length': 522, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'dan_Latn-nno_Latn': {'num_samples': 1997, 'number_of_characters': 500495, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 126.26, 'max_sentence1_length': 522, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.37, 'max_sentence2_length': 417, 'unique_sentence2': 1996}, 'dan_Latn-nob_Latn': {'num_samples': 1997, 'number_of_characters': 503582, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 126.26, 'max_sentence1_length': 522, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 125.91, 'max_sentence2_length': 482, 'unique_sentence2': 1996}, 'dan_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 503965, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 126.26, 'max_sentence1_length': 522, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'deu_Latn-afr_Latn': {'num_samples': 1997, 'number_of_characters': 564002, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 
148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 134.38, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'deu_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 526831, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'deu_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 539452, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'deu_Latn-dan_Latn': {'num_samples': 1997, 'number_of_characters': 547788, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 126.26, 'max_sentence2_length': 522, 'unique_sentence2': 1995}, 'deu_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 594777, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'deu_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 543370, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'deu_Latn-fao_Latn': {'num_samples': 1997, 'number_of_characters': 553453, 'unique_pairs': 1997, 'min_sentence1_length': 9, 
'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.1, 'max_sentence2_length': 433, 'unique_sentence2': 1997}, 'deu_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 538989, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'deu_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 565450, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'deu_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 588758, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'deu_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 495946, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'deu_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 557225, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'deu_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 574026, 'unique_pairs': 1997, 'min_sentence1_length': 
9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'deu_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 582622, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'deu_Latn-isl_Latn': {'num_samples': 1997, 'number_of_characters': 557858, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 131.3, 'max_sentence2_length': 399, 'unique_sentence2': 1996}, 'deu_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 407276, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'deu_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 429055, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'deu_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 555047, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'deu_Latn-ltz_Latn': {'num_samples': 1997, 'number_of_characters': 576407, 'unique_pairs': 1996, 'min_sentence1_length': 9, 
'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 140.59, 'max_sentence2_length': 543, 'unique_sentence2': 1996}, 'deu_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 587565, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'deu_Latn-nno_Latn': {'num_samples': 1997, 'number_of_characters': 544007, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.37, 'max_sentence2_length': 417, 'unique_sentence2': 1996}, 'deu_Latn-nob_Latn': {'num_samples': 1997, 'number_of_characters': 547094, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 125.91, 'max_sentence2_length': 482, 'unique_sentence2': 1996}, 'deu_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 573516, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'deu_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 572865, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'deu_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 570543, 'unique_pairs': 1996, 
'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'deu_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 583850, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'deu_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 568159, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'deu_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 547477, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'deu_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 605611, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'deu_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 561263, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'deu_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 566771, 'unique_pairs': 
1997, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'deu_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 387128, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'deu_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 553382, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 148.05, 'max_sentence1_length': 508, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'div_Thaa-ben_Beng': {'num_samples': 1997, 'number_of_characters': 547650, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'div_Thaa-eng_Latn': {'num_samples': 1997, 'number_of_characters': 551568, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'div_Thaa-eus_Latn': {'num_samples': 1997, 'number_of_characters': 579051, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'div_Thaa-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 548779, 
'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'div_Thaa-hin_Deva': {'num_samples': 1997, 'number_of_characters': 565423, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'div_Thaa-kan_Knda': {'num_samples': 1997, 'number_of_characters': 569384, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'div_Thaa-mar_Deva': {'num_samples': 1997, 'number_of_characters': 564735, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'div_Thaa-nep_Deva': {'num_samples': 1997, 'number_of_characters': 552071, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'div_Thaa-pan_Guru': {'num_samples': 1997, 'number_of_characters': 554270, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'div_Thaa-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 
562589, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'div_Thaa-snd_Arab': {'num_samples': 1997, 'number_of_characters': 524175, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'div_Thaa-tam_Taml': {'num_samples': 1997, 'number_of_characters': 613809, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'div_Thaa-tel_Telu': {'num_samples': 1997, 'number_of_characters': 551375, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'div_Thaa-urd_Arab': {'num_samples': 1997, 'number_of_characters': 551846, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 152.15, 'max_sentence1_length': 609, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'dzo_Tibt-bod_Tibt': {'num_samples': 1997, 'number_of_characters': 543850, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 121.79, 'max_sentence1_length': 411, 'unique_sentence1': 1992, 'min_sentence2_length': 8, 'average_sentence2_length': 150.54, 'max_sentence2_length': 478, 'unique_sentence2': 1993}, 'dzo_Tibt-eng_Latn': {'num_samples': 1997, 
'number_of_characters': 490941, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 121.79, 'max_sentence1_length': 411, 'unique_sentence1': 1992, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'dzo_Tibt-khm_Khmr': {'num_samples': 1997, 'number_of_characters': 531712, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 121.79, 'max_sentence1_length': 411, 'unique_sentence1': 1992, 'min_sentence2_length': 6, 'average_sentence2_length': 144.46, 'max_sentence2_length': 517, 'unique_sentence2': 1996}, 'dzo_Tibt-lao_Laoo': {'num_samples': 1997, 'number_of_characters': 510201, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 121.79, 'max_sentence1_length': 411, 'unique_sentence1': 1992, 'min_sentence2_length': 7, 'average_sentence2_length': 133.69, 'max_sentence2_length': 507, 'unique_sentence2': 1997}, 'dzo_Tibt-mon_Mong': {'num_samples': 1997, 'number_of_characters': 502269, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 121.79, 'max_sentence1_length': 411, 'unique_sentence1': 1992, 'min_sentence2_length': 11, 'average_sentence2_length': 129.72, 'max_sentence2_length': 414, 'unique_sentence2': 1997}, 'dzo_Tibt-mya_Mymr': {'num_samples': 1997, 'number_of_characters': 555075, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 121.79, 'max_sentence1_length': 411, 'unique_sentence1': 1992, 'min_sentence2_length': 9, 'average_sentence2_length': 156.16, 'max_sentence2_length': 773, 'unique_sentence2': 1997}, 'dzo_Tibt-tha_Thai': {'num_samples': 1997, 'number_of_characters': 480689, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 121.79, 'max_sentence1_length': 411, 'unique_sentence1': 1992, 'min_sentence2_length': 7, 'average_sentence2_length': 118.91, 'max_sentence2_length': 439, 'unique_sentence2': 1996}, 'ell_Grek-arb_Arab': {'num_samples': 
1997, 'number_of_characters': 530308, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'ell_Grek-ben_Beng': {'num_samples': 1997, 'number_of_characters': 542929, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'ell_Grek-deu_Latn': {'num_samples': 1997, 'number_of_characters': 594777, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'ell_Grek-eng_Latn': {'num_samples': 1997, 'number_of_characters': 546847, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ell_Grek-fas_Arab': {'num_samples': 1997, 'number_of_characters': 542466, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'ell_Grek-fin_Latn': {'num_samples': 1997, 'number_of_characters': 568927, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'ell_Grek-fra_Latn': 
{'num_samples': 1997, 'number_of_characters': 592235, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'ell_Grek-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 499423, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'ell_Grek-hin_Deva': {'num_samples': 1997, 'number_of_characters': 560702, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'ell_Grek-hun_Latn': {'num_samples': 1997, 'number_of_characters': 577503, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'ell_Grek-hye_Armn': {'num_samples': 1997, 'number_of_characters': 563842, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 132.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'ell_Grek-ind_Latn': {'num_samples': 1997, 'number_of_characters': 586099, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 
'ell_Grek-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 410753, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'ell_Grek-kat_Geor': {'num_samples': 1997, 'number_of_characters': 565719, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 133.5, 'max_sentence2_length': 503, 'unique_sentence2': 1995}, 'ell_Grek-kor_Hang': {'num_samples': 1997, 'number_of_characters': 432532, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'ell_Grek-lit_Latn': {'num_samples': 1997, 'number_of_characters': 558524, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'ell_Grek-nld_Latn': {'num_samples': 1997, 'number_of_characters': 591042, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'ell_Grek-pol_Latn': {'num_samples': 1997, 'number_of_characters': 576993, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 
'ell_Grek-por_Latn': {'num_samples': 1997, 'number_of_characters': 576342, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'ell_Grek-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 574020, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'ell_Grek-spa_Latn': {'num_samples': 1997, 'number_of_characters': 587327, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'ell_Grek-sqi_Latn': {'num_samples': 1997, 'number_of_characters': 582734, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 142.02, 'max_sentence2_length': 461, 'unique_sentence2': 1996}, 'ell_Grek-swa_Latn': {'num_samples': 1997, 'number_of_characters': 571636, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'ell_Grek-swe_Latn': {'num_samples': 1997, 'number_of_characters': 550954, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 
1996}, 'ell_Grek-tam_Taml': {'num_samples': 1997, 'number_of_characters': 609088, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'ell_Grek-tur_Latn': {'num_samples': 1997, 'number_of_characters': 564740, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'ell_Grek-vie_Latn': {'num_samples': 1997, 'number_of_characters': 570248, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'ell_Grek-zho_Hant': {'num_samples': 1997, 'number_of_characters': 390605, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'ell_Grek-zul_Latn': {'num_samples': 1997, 'number_of_characters': 556859, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 149.79, 'max_sentence1_length': 584, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'eng_Latn-afr_Latn': {'num_samples': 1997, 'number_of_characters': 516072, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 134.38, 'max_sentence2_length': 437, 
'unique_sentence2': 1996}, 'eng_Latn-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 415227, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'eng_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 478901, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'eng_Latn-aze_Latn': {'num_samples': 1997, 'number_of_characters': 517354, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 135.02, 'max_sentence2_length': 398, 'unique_sentence2': 1997}, 'eng_Latn-bak_Cyrl': {'num_samples': 1997, 'number_of_characters': 494046, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 123.35, 'max_sentence2_length': 437, 'unique_sentence2': 1995}, 'eng_Latn-bel_Cyrl': {'num_samples': 1997, 'number_of_characters': 503810, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 128.24, 'max_sentence2_length': 422, 'unique_sentence2': 1996}, 'eng_Latn-bem_Latn': {'num_samples': 1997, 'number_of_characters': 546212, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 149.47, 'max_sentence2_length': 
465, 'unique_sentence2': 1997}, 'eng_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 491522, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'eng_Latn-bod_Tibt': {'num_samples': 1997, 'number_of_characters': 548349, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 150.54, 'max_sentence2_length': 478, 'unique_sentence2': 1993}, 'eng_Latn-bos_Latn': {'num_samples': 1997, 'number_of_characters': 502630, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 127.65, 'max_sentence2_length': 434, 'unique_sentence2': 1996}, 'eng_Latn-bul_Cyrl': {'num_samples': 1997, 'number_of_characters': 517609, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 135.15, 'max_sentence2_length': 493, 'unique_sentence2': 1996}, 'eng_Latn-cat_Latn': {'num_samples': 1997, 'number_of_characters': 530680, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 141.69, 'max_sentence2_length': 460, 'unique_sentence2': 1997}, 'eng_Latn-ces_Latn': {'num_samples': 1997, 'number_of_characters': 489038, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 120.84, 
'max_sentence2_length': 474, 'unique_sentence2': 1997}, 'eng_Latn-ckb_Arab': {'num_samples': 1997, 'number_of_characters': 500087, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 126.37, 'max_sentence2_length': 399, 'unique_sentence2': 1995}, 'eng_Latn-cym_Latn': {'num_samples': 1997, 'number_of_characters': 514225, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 133.45, 'max_sentence2_length': 444, 'unique_sentence2': 1997}, 'eng_Latn-dan_Latn': {'num_samples': 1997, 'number_of_characters': 499858, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 126.26, 'max_sentence2_length': 522, 'unique_sentence2': 1995}, 'eng_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 543370, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'eng_Latn-div_Thaa': {'num_samples': 1997, 'number_of_characters': 551568, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'eng_Latn-dzo_Tibt': {'num_samples': 1997, 'number_of_characters': 490941, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 
121.79, 'max_sentence2_length': 411, 'unique_sentence2': 1992}, 'eng_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 546847, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'eng_Latn-eus_Latn': {'num_samples': 1997, 'number_of_characters': 522923, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'eng_Latn-ewe_Latn': {'num_samples': 1997, 'number_of_characters': 486698, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 119.67, 'max_sentence2_length': 493, 'unique_sentence2': 1994}, 'eng_Latn-fao_Latn': {'num_samples': 1997, 'number_of_characters': 505523, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 129.1, 'max_sentence2_length': 433, 'unique_sentence2': 1997}, 'eng_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 491059, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'eng_Latn-fij_Latn': {'num_samples': 1997, 'number_of_characters': 548225, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 
'average_sentence2_length': 150.48, 'max_sentence2_length': 448, 'unique_sentence2': 1988}, 'eng_Latn-fil_Latn': {'num_samples': 1997, 'number_of_characters': 541140, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 146.93, 'max_sentence2_length': 554, 'unique_sentence2': 1997}, 'eng_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 517520, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'eng_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 540828, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'eng_Latn-fuc_Latn': {'num_samples': 1997, 'number_of_characters': 476200, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 114.41, 'max_sentence2_length': 376, 'unique_sentence2': 1996}, 'eng_Latn-gle_Latn': {'num_samples': 1997, 'number_of_characters': 542529, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 147.63, 'max_sentence2_length': 461, 'unique_sentence2': 1997}, 'eng_Latn-glg_Latn': {'num_samples': 1997, 'number_of_characters': 519706, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 
'min_sentence2_length': 6, 'average_sentence2_length': 136.2, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'eng_Latn-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 492651, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'eng_Latn-hau_Latn': {'num_samples': 1997, 'number_of_characters': 517686, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'eng_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 448016, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'eng_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 509295, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'eng_Latn-hmn_Latn': {'num_samples': 1997, 'number_of_characters': 578510, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 165.64, 'max_sentence2_length': 643, 'unique_sentence2': 1997}, 'eng_Latn-hrv_Latn': {'num_samples': 1997, 'number_of_characters': 503645, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 
1997, 'min_sentence2_length': 7, 'average_sentence2_length': 128.15, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'eng_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 526096, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'eng_Latn-hye_Armn': {'num_samples': 1997, 'number_of_characters': 512435, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 132.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'eng_Latn-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 493821, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 'unique_sentence2': 1997}, 'eng_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 534692, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'eng_Latn-isl_Latn': {'num_samples': 1997, 'number_of_characters': 509928, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 131.3, 'max_sentence2_length': 399, 'unique_sentence2': 1996}, 'eng_Latn-ita_Latn': {'num_samples': 1997, 'number_of_characters': 536937, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 
'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 144.83, 'max_sentence2_length': 623, 'unique_sentence2': 1996}, 'eng_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 359346, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'eng_Latn-kan_Knda': {'num_samples': 1997, 'number_of_characters': 513256, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'eng_Latn-kat_Geor': {'num_samples': 1997, 'number_of_characters': 514312, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 133.5, 'max_sentence2_length': 503, 'unique_sentence2': 1995}, 'eng_Latn-kaz_Cyrl': {'num_samples': 1997, 'number_of_characters': 507996, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 130.33, 'max_sentence2_length': 473, 'unique_sentence2': 1996}, 'eng_Latn-khm_Khmr': {'num_samples': 1997, 'number_of_characters': 536211, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 144.46, 'max_sentence2_length': 517, 'unique_sentence2': 1996}, 'eng_Latn-kin_Latn': {'num_samples': 1997, 'number_of_characters': 551507, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 
437, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 152.12, 'max_sentence2_length': 541, 'unique_sentence2': 1996}, 'eng_Latn-kir_Cyrl': {'num_samples': 1997, 'number_of_characters': 498584, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 125.62, 'max_sentence2_length': 395, 'unique_sentence2': 1996}, 'eng_Latn-kmr_Latn': {'num_samples': 1997, 'number_of_characters': 493666, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 123.16, 'max_sentence2_length': 420, 'unique_sentence2': 1996}, 'eng_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 381125, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'eng_Latn-lao_Laoo': {'num_samples': 1997, 'number_of_characters': 514700, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 133.69, 'max_sentence2_length': 507, 'unique_sentence2': 1997}, 'eng_Latn-lav_Latn': {'num_samples': 1997, 'number_of_characters': 515908, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 134.3, 'max_sentence2_length': 503, 'unique_sentence2': 1994}, 'eng_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 507117, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 
'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'eng_Latn-ltz_Latn': {'num_samples': 1997, 'number_of_characters': 528477, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 140.59, 'max_sentence2_length': 543, 'unique_sentence2': 1996}, 'eng_Latn-mal_Mlym': {'num_samples': 1997, 'number_of_characters': 551872, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 152.3, 'max_sentence2_length': 540, 'unique_sentence2': 1996}, 'eng_Latn-mar_Deva': {'num_samples': 1997, 'number_of_characters': 508607, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'eng_Latn-mey_Arab': {'num_samples': 1997, 'number_of_characters': 461555, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 107.08, 'max_sentence2_length': 392, 'unique_sentence2': 1993}, 'eng_Latn-mkd_Cyrl': {'num_samples': 1997, 'number_of_characters': 515611, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 134.15, 'max_sentence2_length': 451, 'unique_sentence2': 1997}, 'eng_Latn-mlg_Latn': {'num_samples': 1997, 'number_of_characters': 568028, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 
124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 160.39, 'max_sentence2_length': 559, 'unique_sentence2': 1994}, 'eng_Latn-mlt_Latn': {'num_samples': 1997, 'number_of_characters': 525195, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 138.95, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'eng_Latn-mon_Mong': {'num_samples': 1997, 'number_of_characters': 506768, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 129.72, 'max_sentence2_length': 414, 'unique_sentence2': 1997}, 'eng_Latn-mri_Latn': {'num_samples': 1997, 'number_of_characters': 521844, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 137.27, 'max_sentence2_length': 443, 'unique_sentence2': 1997}, 'eng_Latn-msa_Latn': {'num_samples': 1997, 'number_of_characters': 524903, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 138.8, 'max_sentence2_length': 463, 'unique_sentence2': 1997}, 'eng_Latn-mya_Mymr': {'num_samples': 1997, 'number_of_characters': 559574, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 156.16, 'max_sentence2_length': 773, 'unique_sentence2': 1997}, 'eng_Latn-nde_Latn': {'num_samples': 1997, 'number_of_characters': 545459, 'unique_pairs': 1997, 'min_sentence1_length': 7, 
'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 149.09, 'max_sentence2_length': 590, 'unique_sentence2': 1997}, 'eng_Latn-nep_Deva': {'num_samples': 1997, 'number_of_characters': 495943, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'eng_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 539635, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'eng_Latn-nno_Latn': {'num_samples': 1997, 'number_of_characters': 496077, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.37, 'max_sentence2_length': 417, 'unique_sentence2': 1996}, 'eng_Latn-nob_Latn': {'num_samples': 1997, 'number_of_characters': 499164, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 125.91, 'max_sentence2_length': 482, 'unique_sentence2': 1996}, 'eng_Latn-nso_Latn': {'num_samples': 1997, 'number_of_characters': 539219, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 1996}, 'eng_Latn-nya_Latn': {'num_samples': 1997, 'number_of_characters': 532002, 'unique_pairs': 1997, 'min_sentence1_length': 
7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 142.35, 'max_sentence2_length': 464, 'unique_sentence2': 1993}, 'eng_Latn-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 485151, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'eng_Latn-pan_Guru': {'num_samples': 1997, 'number_of_characters': 498142, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'eng_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 525586, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'eng_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 524935, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'eng_Latn-prs_Arab': {'num_samples': 1997, 'number_of_characters': 490256, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 121.45, 'max_sentence2_length': 365, 'unique_sentence2': 1997}, 'eng_Latn-pus_Arab': {'num_samples': 1997, 'number_of_characters': 490353, 'unique_pairs': 1997, 
'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 121.5, 'max_sentence2_length': 366, 'unique_sentence2': 1996}, 'eng_Latn-ron_Latn': {'num_samples': 1997, 'number_of_characters': 540205, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 146.46, 'max_sentence2_length': 518, 'unique_sentence2': 1997}, 'eng_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 522613, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'eng_Latn-shi_Arab': {'num_samples': 1997, 'number_of_characters': 462633, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 3, 'average_sentence2_length': 107.62, 'max_sentence2_length': 378, 'unique_sentence2': 1996}, 'eng_Latn-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 506461, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'eng_Latn-slk_Latn': {'num_samples': 1997, 'number_of_characters': 500689, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 126.67, 'max_sentence2_length': 403, 'unique_sentence2': 1996}, 'eng_Latn-slv_Latn': {'num_samples': 1997, 'number_of_characters': 500616, 'unique_pairs': 
1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 126.64, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'eng_Latn-smo_Latn': {'num_samples': 1997, 'number_of_characters': 525575, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 139.14, 'max_sentence2_length': 431, 'unique_sentence2': 1996}, 'eng_Latn-sna_Latn': {'num_samples': 1997, 'number_of_characters': 546050, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 149.39, 'max_sentence2_length': 511, 'unique_sentence2': 1995}, 'eng_Latn-snd_Arab': {'num_samples': 1997, 'number_of_characters': 468047, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'eng_Latn-som_Latn': {'num_samples': 1997, 'number_of_characters': 539012, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'eng_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 535920, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'eng_Latn-sqi_Latn': {'num_samples': 1997, 'number_of_characters': 531327, 
'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 142.02, 'max_sentence2_length': 461, 'unique_sentence2': 1996}, 'eng_Latn-srp_Cyrl': {'num_samples': 1997, 'number_of_characters': 500023, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 126.34, 'max_sentence2_length': 439, 'unique_sentence2': 1995}, 'eng_Latn-srp_Latn': {'num_samples': 1997, 'number_of_characters': 503861, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 128.26, 'max_sentence2_length': 452, 'unique_sentence2': 1996}, 'eng_Latn-ssw_Latn': {'num_samples': 1997, 'number_of_characters': 535862, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'eng_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 520229, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'eng_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 499547, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'eng_Latn-tah_Latn': {'num_samples': 1997, 'number_of_characters': 
557343, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 155.04, 'max_sentence2_length': 524, 'unique_sentence2': 1997}, 'eng_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 557681, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'eng_Latn-tat_Cyrl': {'num_samples': 1997, 'number_of_characters': 493646, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 123.15, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'eng_Latn-tel_Telu': {'num_samples': 1997, 'number_of_characters': 495247, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'eng_Latn-tgk_Cyrl': {'num_samples': 1997, 'number_of_characters': 521867, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 3, 'average_sentence2_length': 137.28, 'max_sentence2_length': 451, 'unique_sentence2': 1995}, 'eng_Latn-tha_Thai': {'num_samples': 1997, 'number_of_characters': 485188, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 118.91, 'max_sentence2_length': 439, 'unique_sentence2': 1996}, 'eng_Latn-tir_Ethi': {'num_samples': 1997, 
'number_of_characters': 412958, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 272, 'unique_sentence2': 1996}, 'eng_Latn-ton_Latn': {'num_samples': 1997, 'number_of_characters': 561360, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 157.06, 'max_sentence2_length': 468, 'unique_sentence2': 1997}, 'eng_Latn-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 582003, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 'unique_sentence2': 1997}, 'eng_Latn-tuk_Latn': {'num_samples': 1997, 'number_of_characters': 532994, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 142.85, 'max_sentence2_length': 576, 'unique_sentence2': 1996}, 'eng_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 513333, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'eng_Latn-uig_Arab': {'num_samples': 1997, 'number_of_characters': 558742, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 155.74, 'max_sentence2_length': 592, 'unique_sentence2': 1996}, 'eng_Latn-ukr_Cyrl': {'num_samples': 
1997, 'number_of_characters': 510503, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 131.59, 'max_sentence2_length': 440, 'unique_sentence2': 1996}, 'eng_Latn-urd_Arab': {'num_samples': 1997, 'number_of_characters': 495718, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'eng_Latn-uzb_Latn': {'num_samples': 1997, 'number_of_characters': 541415, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 147.07, 'max_sentence2_length': 470, 'unique_sentence2': 1996}, 'eng_Latn-ven_Latn': {'num_samples': 1997, 'number_of_characters': 547476, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 150.1, 'max_sentence2_length': 535, 'unique_sentence2': 1993}, 'eng_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 518841, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'eng_Latn-wol_Latn': {'num_samples': 1997, 'number_of_characters': 487523, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 'eng_Latn-xho_Latn': 
{'num_samples': 1997, 'number_of_characters': 515810, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'eng_Latn-yor_Latn': {'num_samples': 1997, 'number_of_characters': 563808, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'eng_Latn-yue_Hant': {'num_samples': 1997, 'number_of_characters': 326607, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 4, 'average_sentence2_length': 39.5, 'max_sentence2_length': 133, 'unique_sentence2': 1996}, 'eng_Latn-zho_Hans': {'num_samples': 1997, 'number_of_characters': 332681, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 4, 'average_sentence2_length': 42.54, 'max_sentence2_length': 263, 'unique_sentence2': 1997}, 'eng_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 339198, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'eng_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 505452, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.05, 'max_sentence1_length': 437, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'eus_Latn-ben_Beng': 
{'num_samples': 1997, 'number_of_characters': 519005, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'eus_Latn-div_Thaa': {'num_samples': 1997, 'number_of_characters': 579051, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'eus_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 522923, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'eus_Latn-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 520134, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'eus_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 536778, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'eus_Latn-kan_Knda': {'num_samples': 1997, 'number_of_characters': 540739, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 
'eus_Latn-mar_Deva': {'num_samples': 1997, 'number_of_characters': 536090, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'eus_Latn-nep_Deva': {'num_samples': 1997, 'number_of_characters': 523426, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'eus_Latn-pan_Guru': {'num_samples': 1997, 'number_of_characters': 525625, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'eus_Latn-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 533944, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'eus_Latn-snd_Arab': {'num_samples': 1997, 'number_of_characters': 495530, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'eus_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 585164, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 
1997}, 'eus_Latn-tel_Telu': {'num_samples': 1997, 'number_of_characters': 522730, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'eus_Latn-urd_Arab': {'num_samples': 1997, 'number_of_characters': 523201, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 137.81, 'max_sentence1_length': 393, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'ewe_Latn-bem_Latn': {'num_samples': 1997, 'number_of_characters': 537470, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 119.67, 'max_sentence1_length': 493, 'unique_sentence1': 1994, 'min_sentence2_length': 8, 'average_sentence2_length': 149.47, 'max_sentence2_length': 465, 'unique_sentence2': 1997}, 'ewe_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 486698, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 119.67, 'max_sentence1_length': 493, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ewe_Latn-fuc_Latn': {'num_samples': 1997, 'number_of_characters': 467458, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 119.67, 'max_sentence1_length': 493, 'unique_sentence1': 1994, 'min_sentence2_length': 6, 'average_sentence2_length': 114.41, 'max_sentence2_length': 376, 'unique_sentence2': 1996}, 'ewe_Latn-kin_Latn': {'num_samples': 1997, 'number_of_characters': 542765, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 119.67, 'max_sentence1_length': 493, 'unique_sentence1': 1994, 'min_sentence2_length': 10, 'average_sentence2_length': 152.12, 'max_sentence2_length': 541, 
'unique_sentence2': 1996}, 'ewe_Latn-nde_Latn': {'num_samples': 1997, 'number_of_characters': 536717, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 119.67, 'max_sentence1_length': 493, 'unique_sentence1': 1994, 'min_sentence2_length': 6, 'average_sentence2_length': 149.09, 'max_sentence2_length': 590, 'unique_sentence2': 1997}, 'ewe_Latn-nya_Latn': {'num_samples': 1997, 'number_of_characters': 523260, 'unique_pairs': 1995, 'min_sentence1_length': 6, 'average_sentence1_length': 119.67, 'max_sentence1_length': 493, 'unique_sentence1': 1994, 'min_sentence2_length': 10, 'average_sentence2_length': 142.35, 'max_sentence2_length': 464, 'unique_sentence2': 1993}, 'ewe_Latn-sna_Latn': {'num_samples': 1997, 'number_of_characters': 537308, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 119.67, 'max_sentence1_length': 493, 'unique_sentence1': 1994, 'min_sentence2_length': 6, 'average_sentence2_length': 149.39, 'max_sentence2_length': 511, 'unique_sentence2': 1995}, 'ewe_Latn-ven_Latn': {'num_samples': 1997, 'number_of_characters': 538734, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 119.67, 'max_sentence1_length': 493, 'unique_sentence1': 1994, 'min_sentence2_length': 10, 'average_sentence2_length': 150.1, 'max_sentence2_length': 535, 'unique_sentence2': 1993}, 'fao_Latn-afr_Latn': {'num_samples': 1997, 'number_of_characters': 526155, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.1, 'max_sentence1_length': 433, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 134.38, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'fao_Latn-dan_Latn': {'num_samples': 1997, 'number_of_characters': 509941, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.1, 'max_sentence1_length': 433, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 126.26, 'max_sentence2_length': 
522, 'unique_sentence2': 1995}, 'fao_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 553453, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.1, 'max_sentence1_length': 433, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'fao_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 505523, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.1, 'max_sentence1_length': 433, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'fao_Latn-isl_Latn': {'num_samples': 1997, 'number_of_characters': 520011, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.1, 'max_sentence1_length': 433, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 131.3, 'max_sentence2_length': 399, 'unique_sentence2': 1996}, 'fao_Latn-ltz_Latn': {'num_samples': 1997, 'number_of_characters': 538560, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.1, 'max_sentence1_length': 433, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 140.59, 'max_sentence2_length': 543, 'unique_sentence2': 1996}, 'fao_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 549718, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.1, 'max_sentence1_length': 433, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'fao_Latn-nno_Latn': {'num_samples': 1997, 'number_of_characters': 506160, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.1, 'max_sentence1_length': 433, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.37, 'max_sentence2_length': 
417, 'unique_sentence2': 1996}, 'fao_Latn-nob_Latn': {'num_samples': 1997, 'number_of_characters': 509247, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.1, 'max_sentence1_length': 433, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 125.91, 'max_sentence2_length': 482, 'unique_sentence2': 1996}, 'fao_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 509630, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.1, 'max_sentence1_length': 433, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'fas_Arab-arb_Arab': {'num_samples': 1997, 'number_of_characters': 474520, 'unique_pairs': 1995, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'fas_Arab-ben_Beng': {'num_samples': 1997, 'number_of_characters': 487141, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'fas_Arab-ckb_Arab': {'num_samples': 1997, 'number_of_characters': 495706, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 126.37, 'max_sentence2_length': 399, 'unique_sentence2': 1995}, 'fas_Arab-deu_Latn': {'num_samples': 1997, 'number_of_characters': 538989, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 
'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'fas_Arab-ell_Grek': {'num_samples': 1997, 'number_of_characters': 542466, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'fas_Arab-eng_Latn': {'num_samples': 1997, 'number_of_characters': 491059, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'fas_Arab-fin_Latn': {'num_samples': 1997, 'number_of_characters': 513139, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'fas_Arab-fra_Latn': {'num_samples': 1997, 'number_of_characters': 536447, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'fas_Arab-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 443635, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'fas_Arab-hin_Deva': {'num_samples': 1997, 'number_of_characters': 504914, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 
130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'fas_Arab-hun_Latn': {'num_samples': 1997, 'number_of_characters': 521715, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'fas_Arab-ind_Latn': {'num_samples': 1997, 'number_of_characters': 530311, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'fas_Arab-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 354965, 'unique_pairs': 1995, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'fas_Arab-kmr_Latn': {'num_samples': 1997, 'number_of_characters': 489285, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 123.16, 'max_sentence2_length': 420, 'unique_sentence2': 1996}, 'fas_Arab-kor_Hang': {'num_samples': 1997, 'number_of_characters': 376744, 'unique_pairs': 1995, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'fas_Arab-lit_Latn': {'num_samples': 1997, 'number_of_characters': 502736, 'unique_pairs': 1995, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 
129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'fas_Arab-mey_Arab': {'num_samples': 1997, 'number_of_characters': 457174, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 107.08, 'max_sentence2_length': 392, 'unique_sentence2': 1993}, 'fas_Arab-nld_Latn': {'num_samples': 1997, 'number_of_characters': 535254, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'fas_Arab-pol_Latn': {'num_samples': 1997, 'number_of_characters': 521205, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'fas_Arab-por_Latn': {'num_samples': 1997, 'number_of_characters': 520554, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'fas_Arab-prs_Arab': {'num_samples': 1997, 'number_of_characters': 485875, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 121.45, 'max_sentence2_length': 365, 'unique_sentence2': 1997}, 'fas_Arab-pus_Arab': {'num_samples': 1997, 'number_of_characters': 485972, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 
'average_sentence2_length': 121.5, 'max_sentence2_length': 366, 'unique_sentence2': 1996}, 'fas_Arab-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 518232, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'fas_Arab-shi_Arab': {'num_samples': 1997, 'number_of_characters': 458252, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 3, 'average_sentence2_length': 107.62, 'max_sentence2_length': 378, 'unique_sentence2': 1996}, 'fas_Arab-spa_Latn': {'num_samples': 1997, 'number_of_characters': 531539, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'fas_Arab-swa_Latn': {'num_samples': 1997, 'number_of_characters': 515848, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'fas_Arab-swe_Latn': {'num_samples': 1997, 'number_of_characters': 495166, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'fas_Arab-tam_Taml': {'num_samples': 1997, 'number_of_characters': 553300, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 
11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'fas_Arab-tgk_Cyrl': {'num_samples': 1997, 'number_of_characters': 517486, 'unique_pairs': 1995, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 3, 'average_sentence2_length': 137.28, 'max_sentence2_length': 451, 'unique_sentence2': 1995}, 'fas_Arab-tur_Latn': {'num_samples': 1997, 'number_of_characters': 508952, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'fas_Arab-vie_Latn': {'num_samples': 1997, 'number_of_characters': 514460, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'fas_Arab-zho_Hant': {'num_samples': 1997, 'number_of_characters': 334817, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'fas_Arab-zul_Latn': {'num_samples': 1997, 'number_of_characters': 501071, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 121.85, 'max_sentence1_length': 389, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'fij_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 548225, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 150.48, 'max_sentence1_length': 448, 'unique_sentence1': 1988, 
'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'fij_Latn-fil_Latn': {'num_samples': 1997, 'number_of_characters': 593925, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 150.48, 'max_sentence1_length': 448, 'unique_sentence1': 1988, 'min_sentence2_length': 10, 'average_sentence2_length': 146.93, 'max_sentence2_length': 554, 'unique_sentence2': 1997}, 'fij_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 587477, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 150.48, 'max_sentence1_length': 448, 'unique_sentence1': 1988, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'fij_Latn-mal_Mlym': {'num_samples': 1997, 'number_of_characters': 604657, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 150.48, 'max_sentence1_length': 448, 'unique_sentence1': 1988, 'min_sentence2_length': 7, 'average_sentence2_length': 152.3, 'max_sentence2_length': 540, 'unique_sentence2': 1996}, 'fij_Latn-mlg_Latn': {'num_samples': 1997, 'number_of_characters': 620813, 'unique_pairs': 1995, 'min_sentence1_length': 10, 'average_sentence1_length': 150.48, 'max_sentence1_length': 448, 'unique_sentence1': 1988, 'min_sentence2_length': 6, 'average_sentence2_length': 160.39, 'max_sentence2_length': 559, 'unique_sentence2': 1994}, 'fij_Latn-mri_Latn': {'num_samples': 1997, 'number_of_characters': 574629, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 150.48, 'max_sentence1_length': 448, 'unique_sentence1': 1988, 'min_sentence2_length': 9, 'average_sentence2_length': 137.27, 'max_sentence2_length': 443, 'unique_sentence2': 1997}, 'fij_Latn-msa_Latn': {'num_samples': 1997, 'number_of_characters': 577688, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 150.48, 'max_sentence1_length': 448, 
'unique_sentence1': 1988, 'min_sentence2_length': 6, 'average_sentence2_length': 138.8, 'max_sentence2_length': 463, 'unique_sentence2': 1997}, 'fij_Latn-smo_Latn': {'num_samples': 1997, 'number_of_characters': 578360, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 150.48, 'max_sentence1_length': 448, 'unique_sentence1': 1988, 'min_sentence2_length': 8, 'average_sentence2_length': 139.14, 'max_sentence2_length': 431, 'unique_sentence2': 1996}, 'fij_Latn-tah_Latn': {'num_samples': 1997, 'number_of_characters': 610128, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 150.48, 'max_sentence1_length': 448, 'unique_sentence1': 1988, 'min_sentence2_length': 9, 'average_sentence2_length': 155.04, 'max_sentence2_length': 524, 'unique_sentence2': 1997}, 'fij_Latn-ton_Latn': {'num_samples': 1997, 'number_of_characters': 614145, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 150.48, 'max_sentence1_length': 448, 'unique_sentence1': 1988, 'min_sentence2_length': 7, 'average_sentence2_length': 157.06, 'max_sentence2_length': 468, 'unique_sentence2': 1997}, 'fil_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 541140, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 146.93, 'max_sentence1_length': 554, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'fil_Latn-fij_Latn': {'num_samples': 1997, 'number_of_characters': 593925, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 146.93, 'max_sentence1_length': 554, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 150.48, 'max_sentence2_length': 448, 'unique_sentence2': 1988}, 'fil_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 580392, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 146.93, 
'max_sentence1_length': 554, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'fil_Latn-mal_Mlym': {'num_samples': 1997, 'number_of_characters': 597572, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 146.93, 'max_sentence1_length': 554, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 152.3, 'max_sentence2_length': 540, 'unique_sentence2': 1996}, 'fil_Latn-mlg_Latn': {'num_samples': 1997, 'number_of_characters': 613728, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 146.93, 'max_sentence1_length': 554, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 160.39, 'max_sentence2_length': 559, 'unique_sentence2': 1994}, 'fil_Latn-mri_Latn': {'num_samples': 1997, 'number_of_characters': 567544, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 146.93, 'max_sentence1_length': 554, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 137.27, 'max_sentence2_length': 443, 'unique_sentence2': 1997}, 'fil_Latn-msa_Latn': {'num_samples': 1997, 'number_of_characters': 570603, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 146.93, 'max_sentence1_length': 554, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 138.8, 'max_sentence2_length': 463, 'unique_sentence2': 1997}, 'fil_Latn-smo_Latn': {'num_samples': 1997, 'number_of_characters': 571275, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 146.93, 'max_sentence1_length': 554, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 139.14, 'max_sentence2_length': 431, 'unique_sentence2': 1996}, 'fil_Latn-tah_Latn': {'num_samples': 1997, 'number_of_characters': 603043, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 
146.93, 'max_sentence1_length': 554, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 155.04, 'max_sentence2_length': 524, 'unique_sentence2': 1997}, 'fil_Latn-ton_Latn': {'num_samples': 1997, 'number_of_characters': 607060, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 146.93, 'max_sentence1_length': 554, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 157.06, 'max_sentence2_length': 468, 'unique_sentence2': 1997}, 'fin_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 500981, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'fin_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 513602, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'fin_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 565450, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'fin_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 568927, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'fin_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 517520, 'unique_pairs': 1997, 'min_sentence1_length': 7, 
'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'fin_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 513139, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'fin_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 562908, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'fin_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 470096, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'fin_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 531375, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'fin_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 548176, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'fin_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 556772, 'unique_pairs': 1997, 'min_sentence1_length': 7, 
'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'fin_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 381426, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'fin_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 403205, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'fin_Latn-lav_Latn': {'num_samples': 1997, 'number_of_characters': 537988, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 134.3, 'max_sentence2_length': 503, 'unique_sentence2': 1994}, 'fin_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 529197, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'fin_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 561715, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'fin_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 547666, 'unique_pairs': 1996, 'min_sentence1_length': 7, 
'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'fin_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 547015, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'fin_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 544693, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'fin_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 558000, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'fin_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 542309, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'fin_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 521627, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'fin_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 579761, 'unique_pairs': 1997, 'min_sentence1_length': 7, 
'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'fin_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 535413, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'fin_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 540921, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'fin_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 361278, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'fin_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 527532, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.1, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'fra_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 524289, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'fra_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 536910, 'unique_pairs': 1997, 'min_sentence1_length': 8, 
'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'fra_Latn-cat_Latn': {'num_samples': 1997, 'number_of_characters': 576068, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 141.69, 'max_sentence2_length': 460, 'unique_sentence2': 1997}, 'fra_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 588758, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'fra_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 592235, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'fra_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 540828, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'fra_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 536447, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'fra_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 562908, 'unique_pairs': 1996, 
'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'fra_Latn-glg_Latn': {'num_samples': 1997, 'number_of_characters': 565094, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 136.2, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'fra_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 493404, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'fra_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 554683, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'fra_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 571484, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'fra_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 580080, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'fra_Latn-ita_Latn': {'num_samples': 1997, 'number_of_characters': 582325, 'unique_pairs': 1996, 
'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 144.83, 'max_sentence2_length': 623, 'unique_sentence2': 1996}, 'fra_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 404734, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'fra_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 426513, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'fra_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 552505, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'fra_Latn-mlt_Latn': {'num_samples': 1997, 'number_of_characters': 570583, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 138.95, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'fra_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 585023, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'fra_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 570974, 'unique_pairs': 1996, 
'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'fra_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 570323, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'fra_Latn-ron_Latn': {'num_samples': 1997, 'number_of_characters': 585593, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 146.46, 'max_sentence2_length': 518, 'unique_sentence2': 1997}, 'fra_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 568001, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'fra_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 581308, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'fra_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 565617, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'fra_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 544935, 'unique_pairs': 
1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'fra_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 603069, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'fra_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 558721, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'fra_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 564229, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'fra_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 384586, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'fra_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 550840, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.77, 'max_sentence1_length': 512, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'fuc_Latn-bem_Latn': {'num_samples': 1997, 'number_of_characters': 526972, 
'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 114.41, 'max_sentence1_length': 376, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 149.47, 'max_sentence2_length': 465, 'unique_sentence2': 1997}, 'fuc_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 476200, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 114.41, 'max_sentence1_length': 376, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'fuc_Latn-ewe_Latn': {'num_samples': 1997, 'number_of_characters': 467458, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 114.41, 'max_sentence1_length': 376, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 119.67, 'max_sentence2_length': 493, 'unique_sentence2': 1994}, 'fuc_Latn-kin_Latn': {'num_samples': 1997, 'number_of_characters': 532267, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 114.41, 'max_sentence1_length': 376, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 152.12, 'max_sentence2_length': 541, 'unique_sentence2': 1996}, 'fuc_Latn-nde_Latn': {'num_samples': 1997, 'number_of_characters': 526219, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 114.41, 'max_sentence1_length': 376, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 149.09, 'max_sentence2_length': 590, 'unique_sentence2': 1997}, 'fuc_Latn-nya_Latn': {'num_samples': 1997, 'number_of_characters': 512762, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 114.41, 'max_sentence1_length': 376, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 142.35, 'max_sentence2_length': 464, 'unique_sentence2': 1993}, 'fuc_Latn-sna_Latn': {'num_samples': 1997, 'number_of_characters': 
526810, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 114.41, 'max_sentence1_length': 376, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 149.39, 'max_sentence2_length': 511, 'unique_sentence2': 1995}, 'fuc_Latn-ven_Latn': {'num_samples': 1997, 'number_of_characters': 528236, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 114.41, 'max_sentence1_length': 376, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 150.1, 'max_sentence2_length': 535, 'unique_sentence2': 1993}, 'gle_Latn-cym_Latn': {'num_samples': 1997, 'number_of_characters': 561314, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 147.63, 'max_sentence1_length': 461, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 133.45, 'max_sentence2_length': 444, 'unique_sentence2': 1997}, 'gle_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 542529, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 147.63, 'max_sentence1_length': 461, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'glg_Latn-cat_Latn': {'num_samples': 1997, 'number_of_characters': 554946, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 136.2, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 141.69, 'max_sentence2_length': 460, 'unique_sentence2': 1997}, 'glg_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 519706, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 136.2, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'glg_Latn-fra_Latn': {'num_samples': 1997, 
'number_of_characters': 565094, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 136.2, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'glg_Latn-ita_Latn': {'num_samples': 1997, 'number_of_characters': 561203, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 136.2, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 144.83, 'max_sentence2_length': 623, 'unique_sentence2': 1996}, 'glg_Latn-mlt_Latn': {'num_samples': 1997, 'number_of_characters': 549461, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 136.2, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 138.95, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'glg_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 549201, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 136.2, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'glg_Latn-ron_Latn': {'num_samples': 1997, 'number_of_characters': 564471, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 136.2, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 146.46, 'max_sentence2_length': 518, 'unique_sentence2': 1997}, 'glg_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 560186, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 136.2, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'guj_Gujr-ben_Beng': {'num_samples': 1997, 
'number_of_characters': 488733, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'guj_Gujr-div_Thaa': {'num_samples': 1997, 'number_of_characters': 548779, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'guj_Gujr-eng_Latn': {'num_samples': 1997, 'number_of_characters': 492651, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'guj_Gujr-eus_Latn': {'num_samples': 1997, 'number_of_characters': 520134, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'guj_Gujr-hin_Deva': {'num_samples': 1997, 'number_of_characters': 506506, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'guj_Gujr-kan_Knda': {'num_samples': 1997, 'number_of_characters': 510467, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'guj_Gujr-mar_Deva': {'num_samples': 
1997, 'number_of_characters': 505818, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'guj_Gujr-nep_Deva': {'num_samples': 1997, 'number_of_characters': 493154, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'guj_Gujr-pan_Guru': {'num_samples': 1997, 'number_of_characters': 495353, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'guj_Gujr-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 503672, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'guj_Gujr-snd_Arab': {'num_samples': 1997, 'number_of_characters': 465258, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'guj_Gujr-tam_Taml': {'num_samples': 1997, 'number_of_characters': 554892, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'guj_Gujr-tel_Telu': 
{'num_samples': 1997, 'number_of_characters': 492458, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'guj_Gujr-urd_Arab': {'num_samples': 1997, 'number_of_characters': 492929, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 122.65, 'max_sentence1_length': 378, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'hau_Latn-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 437473, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'hau_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 517686, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'hau_Latn-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 516067, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 'unique_sentence2': 1997}, 'hau_Latn-nso_Latn': {'num_samples': 1997, 'number_of_characters': 561465, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 1996}, 
'hau_Latn-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 507397, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'hau_Latn-som_Latn': {'num_samples': 1997, 'number_of_characters': 561258, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'hau_Latn-ssw_Latn': {'num_samples': 1997, 'number_of_characters': 558108, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'hau_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 542475, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'hau_Latn-tir_Ethi': {'num_samples': 1997, 'number_of_characters': 435204, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 272, 'unique_sentence2': 1996}, 'hau_Latn-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 604249, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 'unique_sentence2': 
1997}, 'hau_Latn-wol_Latn': {'num_samples': 1997, 'number_of_characters': 509769, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 'hau_Latn-xho_Latn': {'num_samples': 1997, 'number_of_characters': 538056, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'hau_Latn-yor_Latn': {'num_samples': 1997, 'number_of_characters': 586054, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'hau_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 527698, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 135.19, 'max_sentence1_length': 483, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'heb_Hebr-arb_Arab': {'num_samples': 1997, 'number_of_characters': 431477, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'heb_Hebr-ben_Beng': {'num_samples': 1997, 'number_of_characters': 444098, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 
'unique_sentence2': 1997}, 'heb_Hebr-ckb_Arab': {'num_samples': 1997, 'number_of_characters': 452663, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 126.37, 'max_sentence2_length': 399, 'unique_sentence2': 1995}, 'heb_Hebr-deu_Latn': {'num_samples': 1997, 'number_of_characters': 495946, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'heb_Hebr-ell_Grek': {'num_samples': 1997, 'number_of_characters': 499423, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'heb_Hebr-eng_Latn': {'num_samples': 1997, 'number_of_characters': 448016, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'heb_Hebr-fas_Arab': {'num_samples': 1997, 'number_of_characters': 443635, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'heb_Hebr-fin_Latn': {'num_samples': 1997, 'number_of_characters': 470096, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 
'unique_sentence2': 1996}, 'heb_Hebr-fra_Latn': {'num_samples': 1997, 'number_of_characters': 493404, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'heb_Hebr-hin_Deva': {'num_samples': 1997, 'number_of_characters': 461871, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'heb_Hebr-hun_Latn': {'num_samples': 1997, 'number_of_characters': 478672, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'heb_Hebr-ind_Latn': {'num_samples': 1997, 'number_of_characters': 487268, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'heb_Hebr-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 311922, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'heb_Hebr-kmr_Latn': {'num_samples': 1997, 'number_of_characters': 446242, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 123.16, 'max_sentence2_length': 420, 
'unique_sentence2': 1996}, 'heb_Hebr-kor_Hang': {'num_samples': 1997, 'number_of_characters': 333701, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'heb_Hebr-lit_Latn': {'num_samples': 1997, 'number_of_characters': 459693, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'heb_Hebr-mey_Arab': {'num_samples': 1997, 'number_of_characters': 414131, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 107.08, 'max_sentence2_length': 392, 'unique_sentence2': 1993}, 'heb_Hebr-nld_Latn': {'num_samples': 1997, 'number_of_characters': 492211, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'heb_Hebr-pol_Latn': {'num_samples': 1997, 'number_of_characters': 478162, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'heb_Hebr-por_Latn': {'num_samples': 1997, 'number_of_characters': 477511, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 
'unique_sentence2': 1996}, 'heb_Hebr-prs_Arab': {'num_samples': 1997, 'number_of_characters': 442832, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 121.45, 'max_sentence2_length': 365, 'unique_sentence2': 1997}, 'heb_Hebr-pus_Arab': {'num_samples': 1997, 'number_of_characters': 442929, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 121.5, 'max_sentence2_length': 366, 'unique_sentence2': 1996}, 'heb_Hebr-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 475189, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'heb_Hebr-shi_Arab': {'num_samples': 1997, 'number_of_characters': 415209, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 107.62, 'max_sentence2_length': 378, 'unique_sentence2': 1996}, 'heb_Hebr-spa_Latn': {'num_samples': 1997, 'number_of_characters': 488496, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'heb_Hebr-swa_Latn': {'num_samples': 1997, 'number_of_characters': 472805, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 
'unique_sentence2': 1997}, 'heb_Hebr-swe_Latn': {'num_samples': 1997, 'number_of_characters': 452123, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'heb_Hebr-tam_Taml': {'num_samples': 1997, 'number_of_characters': 510257, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'heb_Hebr-tgk_Cyrl': {'num_samples': 1997, 'number_of_characters': 474443, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 137.28, 'max_sentence2_length': 451, 'unique_sentence2': 1995}, 'heb_Hebr-tur_Latn': {'num_samples': 1997, 'number_of_characters': 465909, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'heb_Hebr-vie_Latn': {'num_samples': 1997, 'number_of_characters': 471417, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'heb_Hebr-zho_Hant': {'num_samples': 1997, 'number_of_characters': 291774, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 
'unique_sentence2': 1996}, 'heb_Hebr-zul_Latn': {'num_samples': 1997, 'number_of_characters': 458028, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 100.3, 'max_sentence1_length': 375, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'hin_Deva-arb_Arab': {'num_samples': 1997, 'number_of_characters': 492756, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'hin_Deva-ben_Beng': {'num_samples': 1997, 'number_of_characters': 505377, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'hin_Deva-deu_Latn': {'num_samples': 1997, 'number_of_characters': 557225, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'hin_Deva-div_Thaa': {'num_samples': 1997, 'number_of_characters': 565423, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'hin_Deva-ell_Grek': {'num_samples': 1997, 'number_of_characters': 560702, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 
584, 'unique_sentence2': 1996}, 'hin_Deva-eng_Latn': {'num_samples': 1997, 'number_of_characters': 509295, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'hin_Deva-eus_Latn': {'num_samples': 1997, 'number_of_characters': 536778, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'hin_Deva-fas_Arab': {'num_samples': 1997, 'number_of_characters': 504914, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'hin_Deva-fin_Latn': {'num_samples': 1997, 'number_of_characters': 531375, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'hin_Deva-fra_Latn': {'num_samples': 1997, 'number_of_characters': 554683, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'hin_Deva-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 506506, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 
'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'hin_Deva-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 461871, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'hin_Deva-hun_Latn': {'num_samples': 1997, 'number_of_characters': 539951, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'hin_Deva-ind_Latn': {'num_samples': 1997, 'number_of_characters': 548547, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'hin_Deva-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 373201, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'hin_Deva-kan_Knda': {'num_samples': 1997, 'number_of_characters': 527111, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'hin_Deva-kor_Hang': {'num_samples': 1997, 'number_of_characters': 394980, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 
'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'hin_Deva-lit_Latn': {'num_samples': 1997, 'number_of_characters': 520972, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'hin_Deva-mar_Deva': {'num_samples': 1997, 'number_of_characters': 522462, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'hin_Deva-nep_Deva': {'num_samples': 1997, 'number_of_characters': 509798, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'hin_Deva-nld_Latn': {'num_samples': 1997, 'number_of_characters': 553490, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'hin_Deva-pan_Guru': {'num_samples': 1997, 'number_of_characters': 511997, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'hin_Deva-pol_Latn': {'num_samples': 1997, 'number_of_characters': 539441, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 
139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'hin_Deva-por_Latn': {'num_samples': 1997, 'number_of_characters': 538790, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'hin_Deva-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 536468, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'hin_Deva-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 520316, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'hin_Deva-snd_Arab': {'num_samples': 1997, 'number_of_characters': 481902, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'hin_Deva-spa_Latn': {'num_samples': 1997, 'number_of_characters': 549775, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'hin_Deva-swa_Latn': {'num_samples': 1997, 'number_of_characters': 534084, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 
'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'hin_Deva-swe_Latn': {'num_samples': 1997, 'number_of_characters': 513402, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'hin_Deva-tam_Taml': {'num_samples': 1997, 'number_of_characters': 571536, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'hin_Deva-tel_Telu': {'num_samples': 1997, 'number_of_characters': 509102, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'hin_Deva-tur_Latn': {'num_samples': 1997, 'number_of_characters': 527188, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'hin_Deva-urd_Arab': {'num_samples': 1997, 'number_of_characters': 509573, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'hin_Deva-vie_Latn': {'num_samples': 1997, 'number_of_characters': 532696, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 
'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'hin_Deva-zho_Hant': {'num_samples': 1997, 'number_of_characters': 353053, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'hin_Deva-zul_Latn': {'num_samples': 1997, 'number_of_characters': 519307, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 130.98, 'max_sentence1_length': 394, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'hmn_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 578510, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 165.64, 'max_sentence1_length': 643, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'hrv_Latn-bel_Cyrl': {'num_samples': 1997, 'number_of_characters': 512015, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.15, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 128.24, 'max_sentence2_length': 422, 'unique_sentence2': 1996}, 'hrv_Latn-bos_Latn': {'num_samples': 1997, 'number_of_characters': 510835, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.15, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 127.65, 'max_sentence2_length': 434, 'unique_sentence2': 1996}, 'hrv_Latn-bul_Cyrl': {'num_samples': 1997, 'number_of_characters': 525814, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.15, 'max_sentence1_length': 508, 'unique_sentence1': 
1997, 'min_sentence2_length': 6, 'average_sentence2_length': 135.15, 'max_sentence2_length': 493, 'unique_sentence2': 1996}, 'hrv_Latn-ces_Latn': {'num_samples': 1997, 'number_of_characters': 497243, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.15, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 120.84, 'max_sentence2_length': 474, 'unique_sentence2': 1997}, 'hrv_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 503645, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.15, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'hrv_Latn-mkd_Cyrl': {'num_samples': 1997, 'number_of_characters': 523816, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.15, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 134.15, 'max_sentence2_length': 451, 'unique_sentence2': 1997}, 'hrv_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 533791, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.15, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'hrv_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 530818, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.15, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'hrv_Latn-slk_Latn': {'num_samples': 1997, 'number_of_characters': 508894, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.15, 'max_sentence1_length': 508, 
'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 126.67, 'max_sentence2_length': 403, 'unique_sentence2': 1996}, 'hrv_Latn-slv_Latn': {'num_samples': 1997, 'number_of_characters': 508821, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.15, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 126.64, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'hrv_Latn-srp_Cyrl': {'num_samples': 1997, 'number_of_characters': 508228, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.15, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 126.34, 'max_sentence2_length': 439, 'unique_sentence2': 1995}, 'hrv_Latn-srp_Latn': {'num_samples': 1997, 'number_of_characters': 512066, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.15, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 128.26, 'max_sentence2_length': 452, 'unique_sentence2': 1996}, 'hrv_Latn-ukr_Cyrl': {'num_samples': 1997, 'number_of_characters': 518708, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 128.15, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 131.59, 'max_sentence2_length': 440, 'unique_sentence2': 1996}, 'hun_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 509557, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'hun_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 522178, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 
508, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'hun_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 574026, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'hun_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 577503, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'hun_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 526096, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'hun_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 521715, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'hun_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 548176, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'hun_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 571484, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 
508, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'hun_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 478672, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'hun_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 539951, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'hun_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 565348, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'hun_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 390002, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'hun_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 411781, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'hun_Latn-lav_Latn': {'num_samples': 1997, 'number_of_characters': 546564, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 
'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 134.3, 'max_sentence2_length': 503, 'unique_sentence2': 1994}, 'hun_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 537773, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'hun_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 570291, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'hun_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 556242, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'hun_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 555591, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'hun_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 553269, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'hun_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 566576, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 
'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'hun_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 550885, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'hun_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 530203, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'hun_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 588337, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'hun_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 543989, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'hun_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 549497, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'hun_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 369854, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 
'unique_sentence1': 1997, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'hun_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 536108, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 139.4, 'max_sentence1_length': 508, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'hye_Armn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 563842, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 132.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'hye_Armn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 512435, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 132.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'hye_Armn-kat_Geor': {'num_samples': 1997, 'number_of_characters': 531307, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 132.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 133.5, 'max_sentence2_length': 503, 'unique_sentence2': 1995}, 'hye_Armn-sqi_Latn': {'num_samples': 1997, 'number_of_characters': 548322, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 132.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 142.02, 'max_sentence2_length': 461, 'unique_sentence2': 1996}, 'ibo_Latn-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 413608, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 'max_sentence1_length': 
469, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'ibo_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 493821, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ibo_Latn-hau_Latn': {'num_samples': 1997, 'number_of_characters': 516067, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'ibo_Latn-nso_Latn': {'num_samples': 1997, 'number_of_characters': 537600, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 1996}, 'ibo_Latn-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 483532, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'ibo_Latn-som_Latn': {'num_samples': 1997, 'number_of_characters': 537393, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'ibo_Latn-ssw_Latn': {'num_samples': 1997, 'number_of_characters': 534243, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 
'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'ibo_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 518610, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'ibo_Latn-tir_Ethi': {'num_samples': 1997, 'number_of_characters': 411339, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 272, 'unique_sentence2': 1996}, 'ibo_Latn-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 580384, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 'unique_sentence2': 1997}, 'ibo_Latn-wol_Latn': {'num_samples': 1997, 'number_of_characters': 485904, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 'ibo_Latn-xho_Latn': {'num_samples': 1997, 'number_of_characters': 514191, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'ibo_Latn-yor_Latn': {'num_samples': 1997, 'number_of_characters': 562189, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 
123.24, 'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'ibo_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 503833, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 123.24, 'max_sentence1_length': 469, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'ind_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 518153, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'ind_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 530774, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'ind_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 582622, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'ind_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 586099, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'ind_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 534692, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 
143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ind_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 530311, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'ind_Latn-fij_Latn': {'num_samples': 1997, 'number_of_characters': 587477, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 150.48, 'max_sentence2_length': 448, 'unique_sentence2': 1988}, 'ind_Latn-fil_Latn': {'num_samples': 1997, 'number_of_characters': 580392, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 146.93, 'max_sentence2_length': 554, 'unique_sentence2': 1997}, 'ind_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 556772, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'ind_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 580080, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'ind_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 487268, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 
143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'ind_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 548547, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'ind_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 565348, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'ind_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 398598, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'ind_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 420377, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'ind_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 546369, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'ind_Latn-mal_Mlym': {'num_samples': 1997, 'number_of_characters': 591124, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 
'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 152.3, 'max_sentence2_length': 540, 'unique_sentence2': 1996}, 'ind_Latn-mlg_Latn': {'num_samples': 1997, 'number_of_characters': 607280, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 160.39, 'max_sentence2_length': 559, 'unique_sentence2': 1994}, 'ind_Latn-mri_Latn': {'num_samples': 1997, 'number_of_characters': 561096, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 137.27, 'max_sentence2_length': 443, 'unique_sentence2': 1997}, 'ind_Latn-msa_Latn': {'num_samples': 1997, 'number_of_characters': 564155, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 138.8, 'max_sentence2_length': 463, 'unique_sentence2': 1997}, 'ind_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 578887, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'ind_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 564838, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'ind_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 564187, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 
'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'ind_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 561865, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'ind_Latn-smo_Latn': {'num_samples': 1997, 'number_of_characters': 564827, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 139.14, 'max_sentence2_length': 431, 'unique_sentence2': 1996}, 'ind_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 575172, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'ind_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 559481, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'ind_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 538799, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'ind_Latn-tah_Latn': {'num_samples': 1997, 'number_of_characters': 596595, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 
'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 155.04, 'max_sentence2_length': 524, 'unique_sentence2': 1997}, 'ind_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 596933, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'ind_Latn-ton_Latn': {'num_samples': 1997, 'number_of_characters': 600612, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 157.06, 'max_sentence2_length': 468, 'unique_sentence2': 1997}, 'ind_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 552585, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'ind_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 558093, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'ind_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 378450, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'ind_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 544704, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 143.7, 
'max_sentence1_length': 486, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'isl_Latn-afr_Latn': {'num_samples': 1997, 'number_of_characters': 530560, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 131.3, 'max_sentence1_length': 399, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 134.38, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'isl_Latn-dan_Latn': {'num_samples': 1997, 'number_of_characters': 514346, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 131.3, 'max_sentence1_length': 399, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 126.26, 'max_sentence2_length': 522, 'unique_sentence2': 1995}, 'isl_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 557858, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 131.3, 'max_sentence1_length': 399, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'isl_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 509928, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 131.3, 'max_sentence1_length': 399, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'isl_Latn-fao_Latn': {'num_samples': 1997, 'number_of_characters': 520011, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 131.3, 'max_sentence1_length': 399, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.1, 'max_sentence2_length': 433, 'unique_sentence2': 1997}, 'isl_Latn-ltz_Latn': {'num_samples': 1997, 'number_of_characters': 542965, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 131.3, 
'max_sentence1_length': 399, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 140.59, 'max_sentence2_length': 543, 'unique_sentence2': 1996}, 'isl_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 554123, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 131.3, 'max_sentence1_length': 399, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'isl_Latn-nno_Latn': {'num_samples': 1997, 'number_of_characters': 510565, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 131.3, 'max_sentence1_length': 399, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.37, 'max_sentence2_length': 417, 'unique_sentence2': 1996}, 'isl_Latn-nob_Latn': {'num_samples': 1997, 'number_of_characters': 513652, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 131.3, 'max_sentence1_length': 399, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 125.91, 'max_sentence2_length': 482, 'unique_sentence2': 1996}, 'isl_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 514035, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 131.3, 'max_sentence1_length': 399, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'ita_Latn-cat_Latn': {'num_samples': 1997, 'number_of_characters': 572177, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 144.83, 'max_sentence1_length': 623, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 141.69, 'max_sentence2_length': 460, 'unique_sentence2': 1997}, 'ita_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 536937, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 144.83, 
'max_sentence1_length': 623, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ita_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 582325, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 144.83, 'max_sentence1_length': 623, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'ita_Latn-glg_Latn': {'num_samples': 1997, 'number_of_characters': 561203, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 144.83, 'max_sentence1_length': 623, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 136.2, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'ita_Latn-mlt_Latn': {'num_samples': 1997, 'number_of_characters': 566692, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 144.83, 'max_sentence1_length': 623, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 138.95, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'ita_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 566432, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 144.83, 'max_sentence1_length': 623, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'ita_Latn-ron_Latn': {'num_samples': 1997, 'number_of_characters': 581702, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 144.83, 'max_sentence1_length': 623, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 146.46, 'max_sentence2_length': 518, 'unique_sentence2': 1997}, 'ita_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 577417, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 
144.83, 'max_sentence1_length': 623, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'jpn_Jpan-arb_Arab': {'num_samples': 1997, 'number_of_characters': 342807, 'unique_pairs': 1995, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'jpn_Jpan-ben_Beng': {'num_samples': 1997, 'number_of_characters': 355428, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'jpn_Jpan-deu_Latn': {'num_samples': 1997, 'number_of_characters': 407276, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'jpn_Jpan-ell_Grek': {'num_samples': 1997, 'number_of_characters': 410753, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'jpn_Jpan-eng_Latn': {'num_samples': 1997, 'number_of_characters': 359346, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'jpn_Jpan-fas_Arab': {'num_samples': 1997, 'number_of_characters': 354965, 'unique_pairs': 1995, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 
'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'jpn_Jpan-fin_Latn': {'num_samples': 1997, 'number_of_characters': 381426, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'jpn_Jpan-fra_Latn': {'num_samples': 1997, 'number_of_characters': 404734, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'jpn_Jpan-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 311922, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'jpn_Jpan-hin_Deva': {'num_samples': 1997, 'number_of_characters': 373201, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'jpn_Jpan-hun_Latn': {'num_samples': 1997, 'number_of_characters': 390002, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'jpn_Jpan-ind_Latn': {'num_samples': 1997, 'number_of_characters': 398598, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 
'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'jpn_Jpan-kor_Hang': {'num_samples': 1997, 'number_of_characters': 245031, 'unique_pairs': 1995, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'jpn_Jpan-lit_Latn': {'num_samples': 1997, 'number_of_characters': 371023, 'unique_pairs': 1995, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'jpn_Jpan-nld_Latn': {'num_samples': 1997, 'number_of_characters': 403541, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'jpn_Jpan-pol_Latn': {'num_samples': 1997, 'number_of_characters': 389492, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'jpn_Jpan-por_Latn': {'num_samples': 1997, 'number_of_characters': 388841, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'jpn_Jpan-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 386519, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 
'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'jpn_Jpan-spa_Latn': {'num_samples': 1997, 'number_of_characters': 399826, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'jpn_Jpan-swa_Latn': {'num_samples': 1997, 'number_of_characters': 384135, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'jpn_Jpan-swe_Latn': {'num_samples': 1997, 'number_of_characters': 363453, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'jpn_Jpan-tam_Taml': {'num_samples': 1997, 'number_of_characters': 421587, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'jpn_Jpan-tur_Latn': {'num_samples': 1997, 'number_of_characters': 377239, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'jpn_Jpan-vie_Latn': {'num_samples': 1997, 'number_of_characters': 382747, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 
'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'jpn_Jpan-yue_Hant': {'num_samples': 1997, 'number_of_characters': 190513, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 4, 'average_sentence2_length': 39.5, 'max_sentence2_length': 133, 'unique_sentence2': 1996}, 'jpn_Jpan-zho_Hans': {'num_samples': 1997, 'number_of_characters': 196587, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 4, 'average_sentence2_length': 42.54, 'max_sentence2_length': 263, 'unique_sentence2': 1997}, 'jpn_Jpan-zho_Hant': {'num_samples': 1997, 'number_of_characters': 203104, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'jpn_Jpan-zul_Latn': {'num_samples': 1997, 'number_of_characters': 369358, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 55.9, 'max_sentence1_length': 189, 'unique_sentence1': 1994, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'kan_Knda-ben_Beng': {'num_samples': 1997, 'number_of_characters': 509338, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'kan_Knda-div_Thaa': {'num_samples': 1997, 'number_of_characters': 569384, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 132.97, 
'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'kan_Knda-eng_Latn': {'num_samples': 1997, 'number_of_characters': 513256, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'kan_Knda-eus_Latn': {'num_samples': 1997, 'number_of_characters': 540739, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'kan_Knda-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 510467, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'kan_Knda-hin_Deva': {'num_samples': 1997, 'number_of_characters': 527111, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'kan_Knda-mar_Deva': {'num_samples': 1997, 'number_of_characters': 526423, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'kan_Knda-nep_Deva': {'num_samples': 1997, 'number_of_characters': 513759, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 
132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'kan_Knda-pan_Guru': {'num_samples': 1997, 'number_of_characters': 515958, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'kan_Knda-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 524277, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'kan_Knda-snd_Arab': {'num_samples': 1997, 'number_of_characters': 485863, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'kan_Knda-tam_Taml': {'num_samples': 1997, 'number_of_characters': 575497, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'kan_Knda-tel_Telu': {'num_samples': 1997, 'number_of_characters': 513063, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'kan_Knda-urd_Arab': {'num_samples': 1997, 'number_of_characters': 513534, 'unique_pairs': 1996, 'min_sentence1_length': 6, 
'average_sentence1_length': 132.97, 'max_sentence1_length': 449, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'kat_Geor-ell_Grek': {'num_samples': 1997, 'number_of_characters': 565719, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 133.5, 'max_sentence1_length': 503, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'kat_Geor-eng_Latn': {'num_samples': 1997, 'number_of_characters': 514312, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 133.5, 'max_sentence1_length': 503, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'kat_Geor-hye_Armn': {'num_samples': 1997, 'number_of_characters': 531307, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 133.5, 'max_sentence1_length': 503, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 132.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'kat_Geor-sqi_Latn': {'num_samples': 1997, 'number_of_characters': 550199, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 133.5, 'max_sentence1_length': 503, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 142.02, 'max_sentence2_length': 461, 'unique_sentence2': 1996}, 'kaz_Cyrl-aze_Latn': {'num_samples': 1997, 'number_of_characters': 529910, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 130.33, 'max_sentence1_length': 473, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 135.02, 'max_sentence2_length': 398, 'unique_sentence2': 1997}, 'kaz_Cyrl-bak_Cyrl': {'num_samples': 1997, 'number_of_characters': 506602, 'unique_pairs': 1997, 'min_sentence1_length': 8, 
'average_sentence1_length': 130.33, 'max_sentence1_length': 473, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 123.35, 'max_sentence2_length': 437, 'unique_sentence2': 1995}, 'kaz_Cyrl-eng_Latn': {'num_samples': 1997, 'number_of_characters': 507996, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 130.33, 'max_sentence1_length': 473, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'kaz_Cyrl-kir_Cyrl': {'num_samples': 1997, 'number_of_characters': 511140, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 130.33, 'max_sentence1_length': 473, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 125.62, 'max_sentence2_length': 395, 'unique_sentence2': 1996}, 'kaz_Cyrl-tat_Cyrl': {'num_samples': 1997, 'number_of_characters': 506202, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 130.33, 'max_sentence1_length': 473, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 123.15, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'kaz_Cyrl-tuk_Latn': {'num_samples': 1997, 'number_of_characters': 545550, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 130.33, 'max_sentence1_length': 473, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 142.85, 'max_sentence2_length': 576, 'unique_sentence2': 1996}, 'kaz_Cyrl-tur_Latn': {'num_samples': 1997, 'number_of_characters': 525889, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 130.33, 'max_sentence1_length': 473, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'kaz_Cyrl-uig_Arab': {'num_samples': 1997, 'number_of_characters': 571298, 'unique_pairs': 1997, 
'min_sentence1_length': 8, 'average_sentence1_length': 130.33, 'max_sentence1_length': 473, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 155.74, 'max_sentence2_length': 592, 'unique_sentence2': 1996}, 'kaz_Cyrl-uzb_Latn': {'num_samples': 1997, 'number_of_characters': 553971, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 130.33, 'max_sentence1_length': 473, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 147.07, 'max_sentence2_length': 470, 'unique_sentence2': 1996}, 'khm_Khmr-bod_Tibt': {'num_samples': 1997, 'number_of_characters': 589120, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 144.46, 'max_sentence1_length': 517, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 150.54, 'max_sentence2_length': 478, 'unique_sentence2': 1993}, 'khm_Khmr-dzo_Tibt': {'num_samples': 1997, 'number_of_characters': 531712, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 144.46, 'max_sentence1_length': 517, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 121.79, 'max_sentence2_length': 411, 'unique_sentence2': 1992}, 'khm_Khmr-eng_Latn': {'num_samples': 1997, 'number_of_characters': 536211, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 144.46, 'max_sentence1_length': 517, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'khm_Khmr-lao_Laoo': {'num_samples': 1997, 'number_of_characters': 555471, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 144.46, 'max_sentence1_length': 517, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 133.69, 'max_sentence2_length': 507, 'unique_sentence2': 1997}, 'khm_Khmr-mon_Mong': {'num_samples': 1997, 'number_of_characters': 547539, 'unique_pairs': 
1997, 'min_sentence1_length': 6, 'average_sentence1_length': 144.46, 'max_sentence1_length': 517, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 129.72, 'max_sentence2_length': 414, 'unique_sentence2': 1997}, 'khm_Khmr-mya_Mymr': {'num_samples': 1997, 'number_of_characters': 600345, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 144.46, 'max_sentence1_length': 517, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 156.16, 'max_sentence2_length': 773, 'unique_sentence2': 1997}, 'khm_Khmr-tha_Thai': {'num_samples': 1997, 'number_of_characters': 525959, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 144.46, 'max_sentence1_length': 517, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 118.91, 'max_sentence2_length': 439, 'unique_sentence2': 1996}, 'kin_Latn-bem_Latn': {'num_samples': 1997, 'number_of_characters': 602279, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 152.12, 'max_sentence1_length': 541, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 149.47, 'max_sentence2_length': 465, 'unique_sentence2': 1997}, 'kin_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 551507, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 152.12, 'max_sentence1_length': 541, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'kin_Latn-ewe_Latn': {'num_samples': 1997, 'number_of_characters': 542765, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 152.12, 'max_sentence1_length': 541, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 119.67, 'max_sentence2_length': 493, 'unique_sentence2': 1994}, 'kin_Latn-fuc_Latn': {'num_samples': 1997, 'number_of_characters': 532267, 
'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 152.12, 'max_sentence1_length': 541, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 114.41, 'max_sentence2_length': 376, 'unique_sentence2': 1996}, 'kin_Latn-nde_Latn': {'num_samples': 1997, 'number_of_characters': 601526, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 152.12, 'max_sentence1_length': 541, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 149.09, 'max_sentence2_length': 590, 'unique_sentence2': 1997}, 'kin_Latn-nya_Latn': {'num_samples': 1997, 'number_of_characters': 588069, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 152.12, 'max_sentence1_length': 541, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 142.35, 'max_sentence2_length': 464, 'unique_sentence2': 1993}, 'kin_Latn-sna_Latn': {'num_samples': 1997, 'number_of_characters': 602117, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 152.12, 'max_sentence1_length': 541, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 149.39, 'max_sentence2_length': 511, 'unique_sentence2': 1995}, 'kin_Latn-ven_Latn': {'num_samples': 1997, 'number_of_characters': 603543, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 152.12, 'max_sentence1_length': 541, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 150.1, 'max_sentence2_length': 535, 'unique_sentence2': 1993}, 'kir_Cyrl-aze_Latn': {'num_samples': 1997, 'number_of_characters': 520498, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 125.62, 'max_sentence1_length': 395, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 135.02, 'max_sentence2_length': 398, 'unique_sentence2': 1997}, 'kir_Cyrl-bak_Cyrl': {'num_samples': 1997, 
'number_of_characters': 497190, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 125.62, 'max_sentence1_length': 395, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 123.35, 'max_sentence2_length': 437, 'unique_sentence2': 1995}, 'kir_Cyrl-eng_Latn': {'num_samples': 1997, 'number_of_characters': 498584, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 125.62, 'max_sentence1_length': 395, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'kir_Cyrl-kaz_Cyrl': {'num_samples': 1997, 'number_of_characters': 511140, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 125.62, 'max_sentence1_length': 395, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 130.33, 'max_sentence2_length': 473, 'unique_sentence2': 1996}, 'kir_Cyrl-tat_Cyrl': {'num_samples': 1997, 'number_of_characters': 496790, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 125.62, 'max_sentence1_length': 395, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 123.15, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'kir_Cyrl-tuk_Latn': {'num_samples': 1997, 'number_of_characters': 536138, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 125.62, 'max_sentence1_length': 395, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 142.85, 'max_sentence2_length': 576, 'unique_sentence2': 1996}, 'kir_Cyrl-tur_Latn': {'num_samples': 1997, 'number_of_characters': 516477, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 125.62, 'max_sentence1_length': 395, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'kir_Cyrl-uig_Arab': {'num_samples': 
1997, 'number_of_characters': 561886, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 125.62, 'max_sentence1_length': 395, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 155.74, 'max_sentence2_length': 592, 'unique_sentence2': 1996}, 'kir_Cyrl-uzb_Latn': {'num_samples': 1997, 'number_of_characters': 544559, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 125.62, 'max_sentence1_length': 395, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 147.07, 'max_sentence2_length': 470, 'unique_sentence2': 1996}, 'kmr_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 477127, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 123.16, 'max_sentence1_length': 420, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'kmr_Latn-ckb_Arab': {'num_samples': 1997, 'number_of_characters': 498313, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 123.16, 'max_sentence1_length': 420, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 126.37, 'max_sentence2_length': 399, 'unique_sentence2': 1995}, 'kmr_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 493666, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 123.16, 'max_sentence1_length': 420, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'kmr_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 489285, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 123.16, 'max_sentence1_length': 420, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'kmr_Latn-heb_Hebr': 
{'num_samples': 1997, 'number_of_characters': 446242, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 123.16, 'max_sentence1_length': 420, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'kmr_Latn-mey_Arab': {'num_samples': 1997, 'number_of_characters': 459781, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 123.16, 'max_sentence1_length': 420, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 107.08, 'max_sentence2_length': 392, 'unique_sentence2': 1993}, 'kmr_Latn-prs_Arab': {'num_samples': 1997, 'number_of_characters': 488482, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 123.16, 'max_sentence1_length': 420, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 121.45, 'max_sentence2_length': 365, 'unique_sentence2': 1997}, 'kmr_Latn-pus_Arab': {'num_samples': 1997, 'number_of_characters': 488579, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 123.16, 'max_sentence1_length': 420, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 121.5, 'max_sentence2_length': 366, 'unique_sentence2': 1996}, 'kmr_Latn-shi_Arab': {'num_samples': 1997, 'number_of_characters': 460859, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 123.16, 'max_sentence1_length': 420, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 107.62, 'max_sentence2_length': 378, 'unique_sentence2': 1996}, 'kmr_Latn-tgk_Cyrl': {'num_samples': 1997, 'number_of_characters': 520093, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 123.16, 'max_sentence1_length': 420, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 137.28, 'max_sentence2_length': 451, 'unique_sentence2': 1995}, 
'kor_Hang-arb_Arab': {'num_samples': 1997, 'number_of_characters': 364586, 'unique_pairs': 1995, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'kor_Hang-ben_Beng': {'num_samples': 1997, 'number_of_characters': 377207, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'kor_Hang-deu_Latn': {'num_samples': 1997, 'number_of_characters': 429055, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'kor_Hang-ell_Grek': {'num_samples': 1997, 'number_of_characters': 432532, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'kor_Hang-eng_Latn': {'num_samples': 1997, 'number_of_characters': 381125, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'kor_Hang-fas_Arab': {'num_samples': 1997, 'number_of_characters': 376744, 'unique_pairs': 1995, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 
'kor_Hang-fin_Latn': {'num_samples': 1997, 'number_of_characters': 403205, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'kor_Hang-fra_Latn': {'num_samples': 1997, 'number_of_characters': 426513, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'kor_Hang-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 333701, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'kor_Hang-hin_Deva': {'num_samples': 1997, 'number_of_characters': 394980, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'kor_Hang-hun_Latn': {'num_samples': 1997, 'number_of_characters': 411781, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'kor_Hang-ind_Latn': {'num_samples': 1997, 'number_of_characters': 420377, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 
'kor_Hang-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 245031, 'unique_pairs': 1995, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'kor_Hang-lit_Latn': {'num_samples': 1997, 'number_of_characters': 392802, 'unique_pairs': 1995, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'kor_Hang-nld_Latn': {'num_samples': 1997, 'number_of_characters': 425320, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'kor_Hang-pol_Latn': {'num_samples': 1997, 'number_of_characters': 411271, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'kor_Hang-por_Latn': {'num_samples': 1997, 'number_of_characters': 410620, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'kor_Hang-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 408298, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 
'kor_Hang-spa_Latn': {'num_samples': 1997, 'number_of_characters': 421605, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'kor_Hang-swa_Latn': {'num_samples': 1997, 'number_of_characters': 405914, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'kor_Hang-swe_Latn': {'num_samples': 1997, 'number_of_characters': 385232, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'kor_Hang-tam_Taml': {'num_samples': 1997, 'number_of_characters': 443366, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'kor_Hang-tur_Latn': {'num_samples': 1997, 'number_of_characters': 399018, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'kor_Hang-vie_Latn': {'num_samples': 1997, 'number_of_characters': 404526, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 
'kor_Hang-yue_Hant': {'num_samples': 1997, 'number_of_characters': 212292, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 4, 'average_sentence2_length': 39.5, 'max_sentence2_length': 133, 'unique_sentence2': 1996}, 'kor_Hang-zho_Hans': {'num_samples': 1997, 'number_of_characters': 218366, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 4, 'average_sentence2_length': 42.54, 'max_sentence2_length': 263, 'unique_sentence2': 1997}, 'kor_Hang-zho_Hant': {'num_samples': 1997, 'number_of_characters': 224883, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'kor_Hang-zul_Latn': {'num_samples': 1997, 'number_of_characters': 391137, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 66.8, 'max_sentence1_length': 217, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'lao_Laoo-bod_Tibt': {'num_samples': 1997, 'number_of_characters': 567609, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 133.69, 'max_sentence1_length': 507, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 150.54, 'max_sentence2_length': 478, 'unique_sentence2': 1993}, 'lao_Laoo-dzo_Tibt': {'num_samples': 1997, 'number_of_characters': 510201, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 133.69, 'max_sentence1_length': 507, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 121.79, 'max_sentence2_length': 411, 'unique_sentence2': 1992}, 
'lao_Laoo-eng_Latn': {'num_samples': 1997, 'number_of_characters': 514700, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 133.69, 'max_sentence1_length': 507, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'lao_Laoo-khm_Khmr': {'num_samples': 1997, 'number_of_characters': 555471, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 133.69, 'max_sentence1_length': 507, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 144.46, 'max_sentence2_length': 517, 'unique_sentence2': 1996}, 'lao_Laoo-mon_Mong': {'num_samples': 1997, 'number_of_characters': 526028, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 133.69, 'max_sentence1_length': 507, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 129.72, 'max_sentence2_length': 414, 'unique_sentence2': 1997}, 'lao_Laoo-mya_Mymr': {'num_samples': 1997, 'number_of_characters': 578834, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 133.69, 'max_sentence1_length': 507, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 156.16, 'max_sentence2_length': 773, 'unique_sentence2': 1997}, 'lao_Laoo-tha_Thai': {'num_samples': 1997, 'number_of_characters': 504448, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 133.69, 'max_sentence1_length': 507, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 118.91, 'max_sentence2_length': 439, 'unique_sentence2': 1996}, 'lav_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 515908, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 134.3, 'max_sentence1_length': 503, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 
1997}, 'lav_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 537988, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 134.3, 'max_sentence1_length': 503, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'lav_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 546564, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 134.3, 'max_sentence1_length': 503, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'lav_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 527585, 'unique_pairs': 1995, 'min_sentence1_length': 7, 'average_sentence1_length': 134.3, 'max_sentence1_length': 503, 'unique_sentence1': 1994, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'lit_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 490578, 'unique_pairs': 1995, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'lit_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 503199, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'lit_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 555047, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 
1996}, 'lit_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 558524, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'lit_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 507117, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'lit_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 502736, 'unique_pairs': 1995, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'lit_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 529197, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'lit_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 552505, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'lit_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 459693, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 
'unique_sentence2': 1996}, 'lit_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 520972, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'lit_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 537773, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'lit_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 546369, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'lit_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 371023, 'unique_pairs': 1995, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'lit_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 392802, 'unique_pairs': 1995, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'lit_Latn-lav_Latn': {'num_samples': 1997, 'number_of_characters': 527585, 'unique_pairs': 1995, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 134.3, 'max_sentence2_length': 503, 
'unique_sentence2': 1994}, 'lit_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 551312, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'lit_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 537263, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'lit_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 536612, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'lit_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 534290, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'lit_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 547597, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'lit_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 531906, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 
430, 'unique_sentence2': 1997}, 'lit_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 511224, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'lit_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 569358, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'lit_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 525010, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'lit_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 530518, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'lit_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 350875, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'lit_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 517129, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 129.89, 'max_sentence1_length': 446, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 
'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'ltz_Latn-afr_Latn': {'num_samples': 1997, 'number_of_characters': 549109, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 140.59, 'max_sentence1_length': 543, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 134.38, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'ltz_Latn-dan_Latn': {'num_samples': 1997, 'number_of_characters': 532895, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 140.59, 'max_sentence1_length': 543, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 126.26, 'max_sentence2_length': 522, 'unique_sentence2': 1995}, 'ltz_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 576407, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 140.59, 'max_sentence1_length': 543, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'ltz_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 528477, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 140.59, 'max_sentence1_length': 543, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ltz_Latn-fao_Latn': {'num_samples': 1997, 'number_of_characters': 538560, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 140.59, 'max_sentence1_length': 543, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.1, 'max_sentence2_length': 433, 'unique_sentence2': 1997}, 'ltz_Latn-isl_Latn': {'num_samples': 1997, 'number_of_characters': 542965, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 140.59, 'max_sentence1_length': 543, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 
131.3, 'max_sentence2_length': 399, 'unique_sentence2': 1996}, 'ltz_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 572672, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 140.59, 'max_sentence1_length': 543, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'ltz_Latn-nno_Latn': {'num_samples': 1997, 'number_of_characters': 529114, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 140.59, 'max_sentence1_length': 543, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.37, 'max_sentence2_length': 417, 'unique_sentence2': 1996}, 'ltz_Latn-nob_Latn': {'num_samples': 1997, 'number_of_characters': 532201, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 140.59, 'max_sentence1_length': 543, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 125.91, 'max_sentence2_length': 482, 'unique_sentence2': 1996}, 'ltz_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 532584, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 140.59, 'max_sentence1_length': 543, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'mal_Mlym-eng_Latn': {'num_samples': 1997, 'number_of_characters': 551872, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 152.3, 'max_sentence1_length': 540, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'mal_Mlym-fij_Latn': {'num_samples': 1997, 'number_of_characters': 604657, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 152.3, 'max_sentence1_length': 540, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 
'average_sentence2_length': 150.48, 'max_sentence2_length': 448, 'unique_sentence2': 1988}, 'mal_Mlym-fil_Latn': {'num_samples': 1997, 'number_of_characters': 597572, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 152.3, 'max_sentence1_length': 540, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 146.93, 'max_sentence2_length': 554, 'unique_sentence2': 1997}, 'mal_Mlym-ind_Latn': {'num_samples': 1997, 'number_of_characters': 591124, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 152.3, 'max_sentence1_length': 540, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'mal_Mlym-mlg_Latn': {'num_samples': 1997, 'number_of_characters': 624460, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 152.3, 'max_sentence1_length': 540, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 160.39, 'max_sentence2_length': 559, 'unique_sentence2': 1994}, 'mal_Mlym-mri_Latn': {'num_samples': 1997, 'number_of_characters': 578276, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 152.3, 'max_sentence1_length': 540, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 137.27, 'max_sentence2_length': 443, 'unique_sentence2': 1997}, 'mal_Mlym-msa_Latn': {'num_samples': 1997, 'number_of_characters': 581335, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 152.3, 'max_sentence1_length': 540, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 138.8, 'max_sentence2_length': 463, 'unique_sentence2': 1997}, 'mal_Mlym-smo_Latn': {'num_samples': 1997, 'number_of_characters': 582007, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 152.3, 'max_sentence1_length': 540, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 
'average_sentence2_length': 139.14, 'max_sentence2_length': 431, 'unique_sentence2': 1996}, 'mal_Mlym-tah_Latn': {'num_samples': 1997, 'number_of_characters': 613775, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 152.3, 'max_sentence1_length': 540, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 155.04, 'max_sentence2_length': 524, 'unique_sentence2': 1997}, 'mal_Mlym-ton_Latn': {'num_samples': 1997, 'number_of_characters': 617792, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 152.3, 'max_sentence1_length': 540, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 157.06, 'max_sentence2_length': 468, 'unique_sentence2': 1997}, 'mar_Deva-ben_Beng': {'num_samples': 1997, 'number_of_characters': 504689, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'mar_Deva-div_Thaa': {'num_samples': 1997, 'number_of_characters': 564735, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'mar_Deva-eng_Latn': {'num_samples': 1997, 'number_of_characters': 508607, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'mar_Deva-eus_Latn': {'num_samples': 1997, 'number_of_characters': 536090, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 
6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'mar_Deva-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 505818, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'mar_Deva-hin_Deva': {'num_samples': 1997, 'number_of_characters': 522462, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'mar_Deva-kan_Knda': {'num_samples': 1997, 'number_of_characters': 526423, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'mar_Deva-nep_Deva': {'num_samples': 1997, 'number_of_characters': 509110, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'mar_Deva-pan_Guru': {'num_samples': 1997, 'number_of_characters': 511309, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'mar_Deva-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 519628, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 
'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'mar_Deva-snd_Arab': {'num_samples': 1997, 'number_of_characters': 481214, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'mar_Deva-tam_Taml': {'num_samples': 1997, 'number_of_characters': 570848, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'mar_Deva-tel_Telu': {'num_samples': 1997, 'number_of_characters': 508414, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'mar_Deva-urd_Arab': {'num_samples': 1997, 'number_of_characters': 508885, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 130.64, 'max_sentence1_length': 443, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'mey_Arab-arb_Arab': {'num_samples': 1997, 'number_of_characters': 445016, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 107.08, 'max_sentence1_length': 392, 'unique_sentence1': 1993, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'mey_Arab-ckb_Arab': {'num_samples': 1997, 'number_of_characters': 466202, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 107.08, 'max_sentence1_length': 392, 'unique_sentence1': 
1993, 'min_sentence2_length': 5, 'average_sentence2_length': 126.37, 'max_sentence2_length': 399, 'unique_sentence2': 1995}, 'mey_Arab-eng_Latn': {'num_samples': 1997, 'number_of_characters': 461555, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 107.08, 'max_sentence1_length': 392, 'unique_sentence1': 1993, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'mey_Arab-fas_Arab': {'num_samples': 1997, 'number_of_characters': 457174, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 107.08, 'max_sentence1_length': 392, 'unique_sentence1': 1993, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'mey_Arab-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 414131, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 107.08, 'max_sentence1_length': 392, 'unique_sentence1': 1993, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'mey_Arab-kmr_Latn': {'num_samples': 1997, 'number_of_characters': 459781, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 107.08, 'max_sentence1_length': 392, 'unique_sentence1': 1993, 'min_sentence2_length': 7, 'average_sentence2_length': 123.16, 'max_sentence2_length': 420, 'unique_sentence2': 1996}, 'mey_Arab-prs_Arab': {'num_samples': 1997, 'number_of_characters': 456371, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 107.08, 'max_sentence1_length': 392, 'unique_sentence1': 1993, 'min_sentence2_length': 8, 'average_sentence2_length': 121.45, 'max_sentence2_length': 365, 'unique_sentence2': 1997}, 'mey_Arab-pus_Arab': {'num_samples': 1997, 'number_of_characters': 456468, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 107.08, 'max_sentence1_length': 392, 
'unique_sentence1': 1993, 'min_sentence2_length': 8, 'average_sentence2_length': 121.5, 'max_sentence2_length': 366, 'unique_sentence2': 1996}, 'mey_Arab-shi_Arab': {'num_samples': 1997, 'number_of_characters': 428748, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 107.08, 'max_sentence1_length': 392, 'unique_sentence1': 1993, 'min_sentence2_length': 3, 'average_sentence2_length': 107.62, 'max_sentence2_length': 378, 'unique_sentence2': 1996}, 'mey_Arab-tgk_Cyrl': {'num_samples': 1997, 'number_of_characters': 487982, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 107.08, 'max_sentence1_length': 392, 'unique_sentence1': 1993, 'min_sentence2_length': 3, 'average_sentence2_length': 137.28, 'max_sentence2_length': 451, 'unique_sentence2': 1995}, 'mkd_Cyrl-bel_Cyrl': {'num_samples': 1997, 'number_of_characters': 523981, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.15, 'max_sentence1_length': 451, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 128.24, 'max_sentence2_length': 422, 'unique_sentence2': 1996}, 'mkd_Cyrl-bos_Latn': {'num_samples': 1997, 'number_of_characters': 522801, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.15, 'max_sentence1_length': 451, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 127.65, 'max_sentence2_length': 434, 'unique_sentence2': 1996}, 'mkd_Cyrl-bul_Cyrl': {'num_samples': 1997, 'number_of_characters': 537780, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.15, 'max_sentence1_length': 451, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 135.15, 'max_sentence2_length': 493, 'unique_sentence2': 1996}, 'mkd_Cyrl-ces_Latn': {'num_samples': 1997, 'number_of_characters': 509209, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.15, 'max_sentence1_length': 
451, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 120.84, 'max_sentence2_length': 474, 'unique_sentence2': 1997}, 'mkd_Cyrl-eng_Latn': {'num_samples': 1997, 'number_of_characters': 515611, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.15, 'max_sentence1_length': 451, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'mkd_Cyrl-hrv_Latn': {'num_samples': 1997, 'number_of_characters': 523816, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.15, 'max_sentence1_length': 451, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 128.15, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'mkd_Cyrl-pol_Latn': {'num_samples': 1997, 'number_of_characters': 545757, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.15, 'max_sentence1_length': 451, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'mkd_Cyrl-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 542784, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.15, 'max_sentence1_length': 451, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'mkd_Cyrl-slk_Latn': {'num_samples': 1997, 'number_of_characters': 520860, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.15, 'max_sentence1_length': 451, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 126.67, 'max_sentence2_length': 403, 'unique_sentence2': 1996}, 'mkd_Cyrl-slv_Latn': {'num_samples': 1997, 'number_of_characters': 520787, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.15, 
'max_sentence1_length': 451, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 126.64, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'mkd_Cyrl-srp_Cyrl': {'num_samples': 1997, 'number_of_characters': 520194, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.15, 'max_sentence1_length': 451, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 126.34, 'max_sentence2_length': 439, 'unique_sentence2': 1995}, 'mkd_Cyrl-srp_Latn': {'num_samples': 1997, 'number_of_characters': 524032, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.15, 'max_sentence1_length': 451, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 128.26, 'max_sentence2_length': 452, 'unique_sentence2': 1996}, 'mkd_Cyrl-ukr_Cyrl': {'num_samples': 1997, 'number_of_characters': 530674, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.15, 'max_sentence1_length': 451, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 131.59, 'max_sentence2_length': 440, 'unique_sentence2': 1996}, 'mlg_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 568028, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 160.39, 'max_sentence1_length': 559, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'mlg_Latn-fij_Latn': {'num_samples': 1997, 'number_of_characters': 620813, 'unique_pairs': 1995, 'min_sentence1_length': 6, 'average_sentence1_length': 160.39, 'max_sentence1_length': 559, 'unique_sentence1': 1994, 'min_sentence2_length': 10, 'average_sentence2_length': 150.48, 'max_sentence2_length': 448, 'unique_sentence2': 1988}, 'mlg_Latn-fil_Latn': {'num_samples': 1997, 'number_of_characters': 613728, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 
160.39, 'max_sentence1_length': 559, 'unique_sentence1': 1994, 'min_sentence2_length': 10, 'average_sentence2_length': 146.93, 'max_sentence2_length': 554, 'unique_sentence2': 1997}, 'mlg_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 607280, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 160.39, 'max_sentence1_length': 559, 'unique_sentence1': 1994, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'mlg_Latn-mal_Mlym': {'num_samples': 1997, 'number_of_characters': 624460, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 160.39, 'max_sentence1_length': 559, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 152.3, 'max_sentence2_length': 540, 'unique_sentence2': 1996}, 'mlg_Latn-mri_Latn': {'num_samples': 1997, 'number_of_characters': 594432, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 160.39, 'max_sentence1_length': 559, 'unique_sentence1': 1994, 'min_sentence2_length': 9, 'average_sentence2_length': 137.27, 'max_sentence2_length': 443, 'unique_sentence2': 1997}, 'mlg_Latn-msa_Latn': {'num_samples': 1997, 'number_of_characters': 597491, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 160.39, 'max_sentence1_length': 559, 'unique_sentence1': 1994, 'min_sentence2_length': 6, 'average_sentence2_length': 138.8, 'max_sentence2_length': 463, 'unique_sentence2': 1997}, 'mlg_Latn-smo_Latn': {'num_samples': 1997, 'number_of_characters': 598163, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 160.39, 'max_sentence1_length': 559, 'unique_sentence1': 1994, 'min_sentence2_length': 8, 'average_sentence2_length': 139.14, 'max_sentence2_length': 431, 'unique_sentence2': 1996}, 'mlg_Latn-tah_Latn': {'num_samples': 1997, 'number_of_characters': 629931, 'unique_pairs': 1997, 'min_sentence1_length': 6, 
'average_sentence1_length': 160.39, 'max_sentence1_length': 559, 'unique_sentence1': 1994, 'min_sentence2_length': 9, 'average_sentence2_length': 155.04, 'max_sentence2_length': 524, 'unique_sentence2': 1997}, 'mlg_Latn-ton_Latn': {'num_samples': 1997, 'number_of_characters': 633948, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 160.39, 'max_sentence1_length': 559, 'unique_sentence1': 1994, 'min_sentence2_length': 7, 'average_sentence2_length': 157.06, 'max_sentence2_length': 468, 'unique_sentence2': 1997}, 'mlt_Latn-cat_Latn': {'num_samples': 1997, 'number_of_characters': 560435, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 138.95, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 141.69, 'max_sentence2_length': 460, 'unique_sentence2': 1997}, 'mlt_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 525195, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 138.95, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'mlt_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 570583, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 138.95, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'mlt_Latn-glg_Latn': {'num_samples': 1997, 'number_of_characters': 549461, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 138.95, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 136.2, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'mlt_Latn-ita_Latn': {'num_samples': 1997, 'number_of_characters': 566692, 'unique_pairs': 1996, 'min_sentence1_length': 
8, 'average_sentence1_length': 138.95, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 144.83, 'max_sentence2_length': 623, 'unique_sentence2': 1996}, 'mlt_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 554690, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 138.95, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'mlt_Latn-ron_Latn': {'num_samples': 1997, 'number_of_characters': 569960, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 138.95, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 146.46, 'max_sentence2_length': 518, 'unique_sentence2': 1997}, 'mlt_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 565675, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 138.95, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'mon_Mong-bod_Tibt': {'num_samples': 1997, 'number_of_characters': 559677, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 129.72, 'max_sentence1_length': 414, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 150.54, 'max_sentence2_length': 478, 'unique_sentence2': 1993}, 'mon_Mong-dzo_Tibt': {'num_samples': 1997, 'number_of_characters': 502269, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 129.72, 'max_sentence1_length': 414, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 121.79, 'max_sentence2_length': 411, 'unique_sentence2': 1992}, 'mon_Mong-eng_Latn': {'num_samples': 1997, 'number_of_characters': 506768, 'unique_pairs': 1997, 
'min_sentence1_length': 11, 'average_sentence1_length': 129.72, 'max_sentence1_length': 414, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'mon_Mong-khm_Khmr': {'num_samples': 1997, 'number_of_characters': 547539, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 129.72, 'max_sentence1_length': 414, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 144.46, 'max_sentence2_length': 517, 'unique_sentence2': 1996}, 'mon_Mong-lao_Laoo': {'num_samples': 1997, 'number_of_characters': 526028, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 129.72, 'max_sentence1_length': 414, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 133.69, 'max_sentence2_length': 507, 'unique_sentence2': 1997}, 'mon_Mong-mya_Mymr': {'num_samples': 1997, 'number_of_characters': 570902, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 129.72, 'max_sentence1_length': 414, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 156.16, 'max_sentence2_length': 773, 'unique_sentence2': 1997}, 'mon_Mong-tha_Thai': {'num_samples': 1997, 'number_of_characters': 496516, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 129.72, 'max_sentence1_length': 414, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 118.91, 'max_sentence2_length': 439, 'unique_sentence2': 1996}, 'mri_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 521844, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 137.27, 'max_sentence1_length': 443, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'mri_Latn-fij_Latn': {'num_samples': 1997, 'number_of_characters': 574629, 
'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 137.27, 'max_sentence1_length': 443, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 150.48, 'max_sentence2_length': 448, 'unique_sentence2': 1988}, 'mri_Latn-fil_Latn': {'num_samples': 1997, 'number_of_characters': 567544, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 137.27, 'max_sentence1_length': 443, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 146.93, 'max_sentence2_length': 554, 'unique_sentence2': 1997}, 'mri_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 561096, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 137.27, 'max_sentence1_length': 443, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'mri_Latn-mal_Mlym': {'num_samples': 1997, 'number_of_characters': 578276, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 137.27, 'max_sentence1_length': 443, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 152.3, 'max_sentence2_length': 540, 'unique_sentence2': 1996}, 'mri_Latn-mlg_Latn': {'num_samples': 1997, 'number_of_characters': 594432, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 137.27, 'max_sentence1_length': 443, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 160.39, 'max_sentence2_length': 559, 'unique_sentence2': 1994}, 'mri_Latn-msa_Latn': {'num_samples': 1997, 'number_of_characters': 551307, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 137.27, 'max_sentence1_length': 443, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 138.8, 'max_sentence2_length': 463, 'unique_sentence2': 1997}, 'mri_Latn-smo_Latn': {'num_samples': 1997, 'number_of_characters': 
551979, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 137.27, 'max_sentence1_length': 443, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 139.14, 'max_sentence2_length': 431, 'unique_sentence2': 1996}, 'mri_Latn-tah_Latn': {'num_samples': 1997, 'number_of_characters': 583747, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 137.27, 'max_sentence1_length': 443, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 155.04, 'max_sentence2_length': 524, 'unique_sentence2': 1997}, 'mri_Latn-ton_Latn': {'num_samples': 1997, 'number_of_characters': 587764, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 137.27, 'max_sentence1_length': 443, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 157.06, 'max_sentence2_length': 468, 'unique_sentence2': 1997}, 'msa_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 524903, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 138.8, 'max_sentence1_length': 463, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'msa_Latn-fij_Latn': {'num_samples': 1997, 'number_of_characters': 577688, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 138.8, 'max_sentence1_length': 463, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 150.48, 'max_sentence2_length': 448, 'unique_sentence2': 1988}, 'msa_Latn-fil_Latn': {'num_samples': 1997, 'number_of_characters': 570603, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 138.8, 'max_sentence1_length': 463, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 146.93, 'max_sentence2_length': 554, 'unique_sentence2': 1997}, 'msa_Latn-ind_Latn': {'num_samples': 1997, 
'number_of_characters': 564155, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 138.8, 'max_sentence1_length': 463, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'msa_Latn-mal_Mlym': {'num_samples': 1997, 'number_of_characters': 581335, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 138.8, 'max_sentence1_length': 463, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 152.3, 'max_sentence2_length': 540, 'unique_sentence2': 1996}, 'msa_Latn-mlg_Latn': {'num_samples': 1997, 'number_of_characters': 597491, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 138.8, 'max_sentence1_length': 463, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 160.39, 'max_sentence2_length': 559, 'unique_sentence2': 1994}, 'msa_Latn-mri_Latn': {'num_samples': 1997, 'number_of_characters': 551307, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 138.8, 'max_sentence1_length': 463, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 137.27, 'max_sentence2_length': 443, 'unique_sentence2': 1997}, 'msa_Latn-smo_Latn': {'num_samples': 1997, 'number_of_characters': 555038, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 138.8, 'max_sentence1_length': 463, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 139.14, 'max_sentence2_length': 431, 'unique_sentence2': 1996}, 'msa_Latn-tah_Latn': {'num_samples': 1997, 'number_of_characters': 586806, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 138.8, 'max_sentence1_length': 463, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 155.04, 'max_sentence2_length': 524, 'unique_sentence2': 1997}, 'msa_Latn-ton_Latn': {'num_samples': 1997, 
'number_of_characters': 590823, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 138.8, 'max_sentence1_length': 463, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 157.06, 'max_sentence2_length': 468, 'unique_sentence2': 1997}, 'mya_Mymr-bod_Tibt': {'num_samples': 1997, 'number_of_characters': 612483, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 156.16, 'max_sentence1_length': 773, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 150.54, 'max_sentence2_length': 478, 'unique_sentence2': 1993}, 'mya_Mymr-dzo_Tibt': {'num_samples': 1997, 'number_of_characters': 555075, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 156.16, 'max_sentence1_length': 773, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 121.79, 'max_sentence2_length': 411, 'unique_sentence2': 1992}, 'mya_Mymr-eng_Latn': {'num_samples': 1997, 'number_of_characters': 559574, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 156.16, 'max_sentence1_length': 773, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'mya_Mymr-khm_Khmr': {'num_samples': 1997, 'number_of_characters': 600345, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 156.16, 'max_sentence1_length': 773, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 144.46, 'max_sentence2_length': 517, 'unique_sentence2': 1996}, 'mya_Mymr-lao_Laoo': {'num_samples': 1997, 'number_of_characters': 578834, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 156.16, 'max_sentence1_length': 773, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 133.69, 'max_sentence2_length': 507, 'unique_sentence2': 1997}, 'mya_Mymr-mon_Mong': {'num_samples': 
1997, 'number_of_characters': 570902, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 156.16, 'max_sentence1_length': 773, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 129.72, 'max_sentence2_length': 414, 'unique_sentence2': 1997}, 'mya_Mymr-tha_Thai': {'num_samples': 1997, 'number_of_characters': 549322, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 156.16, 'max_sentence1_length': 773, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 118.91, 'max_sentence2_length': 439, 'unique_sentence2': 1996}, 'nde_Latn-bem_Latn': {'num_samples': 1997, 'number_of_characters': 596231, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 149.09, 'max_sentence1_length': 590, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 149.47, 'max_sentence2_length': 465, 'unique_sentence2': 1997}, 'nde_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 545459, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 149.09, 'max_sentence1_length': 590, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'nde_Latn-ewe_Latn': {'num_samples': 1997, 'number_of_characters': 536717, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 149.09, 'max_sentence1_length': 590, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 119.67, 'max_sentence2_length': 493, 'unique_sentence2': 1994}, 'nde_Latn-fuc_Latn': {'num_samples': 1997, 'number_of_characters': 526219, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 149.09, 'max_sentence1_length': 590, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 114.41, 'max_sentence2_length': 376, 'unique_sentence2': 1996}, 'nde_Latn-kin_Latn': 
{'num_samples': 1997, 'number_of_characters': 601526, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 149.09, 'max_sentence1_length': 590, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 152.12, 'max_sentence2_length': 541, 'unique_sentence2': 1996}, 'nde_Latn-nya_Latn': {'num_samples': 1997, 'number_of_characters': 582021, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 149.09, 'max_sentence1_length': 590, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 142.35, 'max_sentence2_length': 464, 'unique_sentence2': 1993}, 'nde_Latn-sna_Latn': {'num_samples': 1997, 'number_of_characters': 596069, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 149.09, 'max_sentence1_length': 590, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 149.39, 'max_sentence2_length': 511, 'unique_sentence2': 1995}, 'nde_Latn-ven_Latn': {'num_samples': 1997, 'number_of_characters': 597495, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 149.09, 'max_sentence1_length': 590, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 150.1, 'max_sentence2_length': 535, 'unique_sentence2': 1993}, 'nep_Deva-ben_Beng': {'num_samples': 1997, 'number_of_characters': 492025, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'nep_Deva-div_Thaa': {'num_samples': 1997, 'number_of_characters': 552071, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 
'nep_Deva-eng_Latn': {'num_samples': 1997, 'number_of_characters': 495943, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'nep_Deva-eus_Latn': {'num_samples': 1997, 'number_of_characters': 523426, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'nep_Deva-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 493154, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'nep_Deva-hin_Deva': {'num_samples': 1997, 'number_of_characters': 509798, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'nep_Deva-kan_Knda': {'num_samples': 1997, 'number_of_characters': 513759, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'nep_Deva-mar_Deva': {'num_samples': 1997, 'number_of_characters': 509110, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 
'nep_Deva-pan_Guru': {'num_samples': 1997, 'number_of_characters': 498645, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'nep_Deva-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 506964, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'nep_Deva-snd_Arab': {'num_samples': 1997, 'number_of_characters': 468550, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'nep_Deva-tam_Taml': {'num_samples': 1997, 'number_of_characters': 558184, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'nep_Deva-tel_Telu': {'num_samples': 1997, 'number_of_characters': 495750, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'nep_Deva-urd_Arab': {'num_samples': 1997, 'number_of_characters': 496221, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 124.3, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 
'nld_Latn-afr_Latn': {'num_samples': 1997, 'number_of_characters': 560267, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 134.38, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'nld_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 523096, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'nld_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 535717, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'nld_Latn-dan_Latn': {'num_samples': 1997, 'number_of_characters': 544053, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 126.26, 'max_sentence2_length': 522, 'unique_sentence2': 1995}, 'nld_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 587565, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'nld_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 591042, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 
1996}, 'nld_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 539635, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'nld_Latn-fao_Latn': {'num_samples': 1997, 'number_of_characters': 549718, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.1, 'max_sentence2_length': 433, 'unique_sentence2': 1997}, 'nld_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 535254, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'nld_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 561715, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'nld_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 585023, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'nld_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 492211, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 
'unique_sentence2': 1996}, 'nld_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 553490, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'nld_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 570291, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'nld_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 578887, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'nld_Latn-isl_Latn': {'num_samples': 1997, 'number_of_characters': 554123, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 131.3, 'max_sentence2_length': 399, 'unique_sentence2': 1996}, 'nld_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 403541, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'nld_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 425320, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 
'unique_sentence2': 1995}, 'nld_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 551312, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'nld_Latn-ltz_Latn': {'num_samples': 1997, 'number_of_characters': 572672, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 140.59, 'max_sentence2_length': 543, 'unique_sentence2': 1996}, 'nld_Latn-nno_Latn': {'num_samples': 1997, 'number_of_characters': 540272, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.37, 'max_sentence2_length': 417, 'unique_sentence2': 1996}, 'nld_Latn-nob_Latn': {'num_samples': 1997, 'number_of_characters': 543359, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 125.91, 'max_sentence2_length': 482, 'unique_sentence2': 1996}, 'nld_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 569781, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'nld_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 569130, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 
497, 'unique_sentence2': 1996}, 'nld_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 566808, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'nld_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 580115, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'nld_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 564424, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'nld_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 543742, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'nld_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 601876, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'nld_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 557528, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 
'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'nld_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 563036, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'nld_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 383393, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'nld_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 549647, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 146.18, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'nno_Latn-afr_Latn': {'num_samples': 1997, 'number_of_characters': 516709, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.37, 'max_sentence1_length': 417, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 134.38, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'nno_Latn-dan_Latn': {'num_samples': 1997, 'number_of_characters': 500495, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.37, 'max_sentence1_length': 417, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 126.26, 'max_sentence2_length': 522, 'unique_sentence2': 1995}, 'nno_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 544007, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.37, 'max_sentence1_length': 417, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 
148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'nno_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 496077, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.37, 'max_sentence1_length': 417, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'nno_Latn-fao_Latn': {'num_samples': 1997, 'number_of_characters': 506160, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.37, 'max_sentence1_length': 417, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.1, 'max_sentence2_length': 433, 'unique_sentence2': 1997}, 'nno_Latn-isl_Latn': {'num_samples': 1997, 'number_of_characters': 510565, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.37, 'max_sentence1_length': 417, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 131.3, 'max_sentence2_length': 399, 'unique_sentence2': 1996}, 'nno_Latn-ltz_Latn': {'num_samples': 1997, 'number_of_characters': 529114, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.37, 'max_sentence1_length': 417, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 140.59, 'max_sentence2_length': 543, 'unique_sentence2': 1996}, 'nno_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 540272, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.37, 'max_sentence1_length': 417, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'nno_Latn-nob_Latn': {'num_samples': 1997, 'number_of_characters': 499801, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.37, 'max_sentence1_length': 417, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 
'average_sentence2_length': 125.91, 'max_sentence2_length': 482, 'unique_sentence2': 1996}, 'nno_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 500184, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.37, 'max_sentence1_length': 417, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'nob_Latn-afr_Latn': {'num_samples': 1997, 'number_of_characters': 519796, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 125.91, 'max_sentence1_length': 482, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 134.38, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'nob_Latn-dan_Latn': {'num_samples': 1997, 'number_of_characters': 503582, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 125.91, 'max_sentence1_length': 482, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 126.26, 'max_sentence2_length': 522, 'unique_sentence2': 1995}, 'nob_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 547094, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 125.91, 'max_sentence1_length': 482, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'nob_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 499164, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 125.91, 'max_sentence1_length': 482, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'nob_Latn-fao_Latn': {'num_samples': 1997, 'number_of_characters': 509247, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 125.91, 'max_sentence1_length': 482, 'unique_sentence1': 1996, 'min_sentence2_length': 
8, 'average_sentence2_length': 129.1, 'max_sentence2_length': 433, 'unique_sentence2': 1997}, 'nob_Latn-isl_Latn': {'num_samples': 1997, 'number_of_characters': 513652, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 125.91, 'max_sentence1_length': 482, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 131.3, 'max_sentence2_length': 399, 'unique_sentence2': 1996}, 'nob_Latn-ltz_Latn': {'num_samples': 1997, 'number_of_characters': 532201, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 125.91, 'max_sentence1_length': 482, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 140.59, 'max_sentence2_length': 543, 'unique_sentence2': 1996}, 'nob_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 543359, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 125.91, 'max_sentence1_length': 482, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'nob_Latn-nno_Latn': {'num_samples': 1997, 'number_of_characters': 499801, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 125.91, 'max_sentence1_length': 482, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.37, 'max_sentence2_length': 417, 'unique_sentence2': 1996}, 'nob_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 503271, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 125.91, 'max_sentence1_length': 482, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'nso_Latn-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 459006, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 1996, 
'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'nso_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 539219, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'nso_Latn-hau_Latn': {'num_samples': 1997, 'number_of_characters': 561465, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'nso_Latn-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 537600, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 'unique_sentence2': 1997}, 'nso_Latn-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 528930, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'nso_Latn-som_Latn': {'num_samples': 1997, 'number_of_characters': 582791, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'nso_Latn-ssw_Latn': {'num_samples': 1997, 'number_of_characters': 579641, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 
1996, 'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'nso_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 564008, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'nso_Latn-tir_Ethi': {'num_samples': 1997, 'number_of_characters': 456737, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 272, 'unique_sentence2': 1996}, 'nso_Latn-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 625782, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 'unique_sentence2': 1997}, 'nso_Latn-wol_Latn': {'num_samples': 1997, 'number_of_characters': 531302, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 'nso_Latn-xho_Latn': {'num_samples': 1997, 'number_of_characters': 559589, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'nso_Latn-yor_Latn': {'num_samples': 1997, 'number_of_characters': 607587, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 
'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'nso_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 549231, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 145.97, 'max_sentence1_length': 487, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'nya_Latn-bem_Latn': {'num_samples': 1997, 'number_of_characters': 582774, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 142.35, 'max_sentence1_length': 464, 'unique_sentence1': 1993, 'min_sentence2_length': 8, 'average_sentence2_length': 149.47, 'max_sentence2_length': 465, 'unique_sentence2': 1997}, 'nya_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 532002, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 142.35, 'max_sentence1_length': 464, 'unique_sentence1': 1993, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'nya_Latn-ewe_Latn': {'num_samples': 1997, 'number_of_characters': 523260, 'unique_pairs': 1995, 'min_sentence1_length': 10, 'average_sentence1_length': 142.35, 'max_sentence1_length': 464, 'unique_sentence1': 1993, 'min_sentence2_length': 6, 'average_sentence2_length': 119.67, 'max_sentence2_length': 493, 'unique_sentence2': 1994}, 'nya_Latn-fuc_Latn': {'num_samples': 1997, 'number_of_characters': 512762, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 142.35, 'max_sentence1_length': 464, 'unique_sentence1': 1993, 'min_sentence2_length': 6, 'average_sentence2_length': 114.41, 'max_sentence2_length': 376, 'unique_sentence2': 1996}, 'nya_Latn-kin_Latn': {'num_samples': 1997, 'number_of_characters': 588069, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 142.35, 
'max_sentence1_length': 464, 'unique_sentence1': 1993, 'min_sentence2_length': 10, 'average_sentence2_length': 152.12, 'max_sentence2_length': 541, 'unique_sentence2': 1996}, 'nya_Latn-nde_Latn': {'num_samples': 1997, 'number_of_characters': 582021, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 142.35, 'max_sentence1_length': 464, 'unique_sentence1': 1993, 'min_sentence2_length': 6, 'average_sentence2_length': 149.09, 'max_sentence2_length': 590, 'unique_sentence2': 1997}, 'nya_Latn-sna_Latn': {'num_samples': 1997, 'number_of_characters': 582612, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 142.35, 'max_sentence1_length': 464, 'unique_sentence1': 1993, 'min_sentence2_length': 6, 'average_sentence2_length': 149.39, 'max_sentence2_length': 511, 'unique_sentence2': 1995}, 'nya_Latn-ven_Latn': {'num_samples': 1997, 'number_of_characters': 584038, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 142.35, 'max_sentence1_length': 464, 'unique_sentence1': 1993, 'min_sentence2_length': 10, 'average_sentence2_length': 150.1, 'max_sentence2_length': 535, 'unique_sentence2': 1993}, 'orm_Ethi-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 404938, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'orm_Ethi-eng_Latn': {'num_samples': 1997, 'number_of_characters': 485151, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'orm_Ethi-hau_Latn': {'num_samples': 1997, 'number_of_characters': 507397, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 
118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'orm_Ethi-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 483532, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 'unique_sentence2': 1997}, 'orm_Ethi-nso_Latn': {'num_samples': 1997, 'number_of_characters': 528930, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 1996}, 'orm_Ethi-som_Latn': {'num_samples': 1997, 'number_of_characters': 528723, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'orm_Ethi-ssw_Latn': {'num_samples': 1997, 'number_of_characters': 525573, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'orm_Ethi-swa_Latn': {'num_samples': 1997, 'number_of_characters': 509940, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'orm_Ethi-tir_Ethi': {'num_samples': 1997, 'number_of_characters': 402669, 'unique_pairs': 1997, 'min_sentence1_length': 9, 
'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 272, 'unique_sentence2': 1996}, 'orm_Ethi-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 571714, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 'unique_sentence2': 1997}, 'orm_Ethi-wol_Latn': {'num_samples': 1997, 'number_of_characters': 477234, 'unique_pairs': 1992, 'min_sentence1_length': 9, 'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 'orm_Ethi-xho_Latn': {'num_samples': 1997, 'number_of_characters': 505521, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'orm_Ethi-yor_Latn': {'num_samples': 1997, 'number_of_characters': 553519, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'orm_Ethi-zul_Latn': {'num_samples': 1997, 'number_of_characters': 495163, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 118.89, 'max_sentence1_length': 466, 'unique_sentence1': 1984, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'pan_Guru-ben_Beng': {'num_samples': 1997, 'number_of_characters': 494224, 'unique_pairs': 1997, 'min_sentence1_length': 
7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 383, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'pan_Guru-div_Thaa': {'num_samples': 1997, 'number_of_characters': 554270, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 383, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'pan_Guru-eng_Latn': {'num_samples': 1997, 'number_of_characters': 498142, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 383, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'pan_Guru-eus_Latn': {'num_samples': 1997, 'number_of_characters': 525625, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 383, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'pan_Guru-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 495353, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 383, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'pan_Guru-hin_Deva': {'num_samples': 1997, 'number_of_characters': 511997, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 383, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'pan_Guru-kan_Knda': {'num_samples': 1997, 'number_of_characters': 515958, 'unique_pairs': 1996, 'min_sentence1_length': 
7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 383, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'pan_Guru-mar_Deva': {'num_samples': 1997, 'number_of_characters': 511309, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 383, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'pan_Guru-nep_Deva': {'num_samples': 1997, 'number_of_characters': 498645, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 383, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'pan_Guru-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 509163, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 383, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'pan_Guru-snd_Arab': {'num_samples': 1997, 'number_of_characters': 470749, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 383, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'pan_Guru-tam_Taml': {'num_samples': 1997, 'number_of_characters': 560383, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 383, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'pan_Guru-tel_Telu': {'num_samples': 1997, 'number_of_characters': 497949, 'unique_pairs': 1996, 'min_sentence1_length': 
7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 383, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'pan_Guru-urd_Arab': {'num_samples': 1997, 'number_of_characters': 498420, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 125.4, 'max_sentence1_length': 383, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'pol_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 509047, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'pol_Latn-bel_Cyrl': {'num_samples': 1997, 'number_of_characters': 533956, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.24, 'max_sentence2_length': 422, 'unique_sentence2': 1996}, 'pol_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 521668, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'pol_Latn-bos_Latn': {'num_samples': 1997, 'number_of_characters': 532776, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 127.65, 'max_sentence2_length': 434, 'unique_sentence2': 1996}, 'pol_Latn-bul_Cyrl': {'num_samples': 1997, 'number_of_characters': 547755, 'unique_pairs': 1996, 
'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 135.15, 'max_sentence2_length': 493, 'unique_sentence2': 1996}, 'pol_Latn-ces_Latn': {'num_samples': 1997, 'number_of_characters': 519184, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 120.84, 'max_sentence2_length': 474, 'unique_sentence2': 1997}, 'pol_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 573516, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'pol_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 576993, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'pol_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 525586, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'pol_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 521205, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'pol_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 547666, 'unique_pairs': 
1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'pol_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 570974, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'pol_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 478162, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'pol_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 539441, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'pol_Latn-hrv_Latn': {'num_samples': 1997, 'number_of_characters': 533791, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.15, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'pol_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 556242, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'pol_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 564838, 
'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'pol_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 389492, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'pol_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 411271, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'pol_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 537263, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'pol_Latn-mkd_Cyrl': {'num_samples': 1997, 'number_of_characters': 545757, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.15, 'max_sentence2_length': 451, 'unique_sentence2': 1997}, 'pol_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 569781, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'pol_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 555081, 
'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'pol_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 552759, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'pol_Latn-slk_Latn': {'num_samples': 1997, 'number_of_characters': 530835, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 126.67, 'max_sentence2_length': 403, 'unique_sentence2': 1996}, 'pol_Latn-slv_Latn': {'num_samples': 1997, 'number_of_characters': 530762, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.64, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'pol_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 566066, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'pol_Latn-srp_Cyrl': {'num_samples': 1997, 'number_of_characters': 530169, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.34, 'max_sentence2_length': 439, 'unique_sentence2': 1995}, 'pol_Latn-srp_Latn': {'num_samples': 1997, 'number_of_characters': 
534007, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 128.26, 'max_sentence2_length': 452, 'unique_sentence2': 1996}, 'pol_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 550375, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'pol_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 529693, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'pol_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 587827, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'pol_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 543479, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'pol_Latn-ukr_Cyrl': {'num_samples': 1997, 'number_of_characters': 540649, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 131.59, 'max_sentence2_length': 440, 'unique_sentence2': 1996}, 'pol_Latn-vie_Latn': {'num_samples': 1997, 
'number_of_characters': 548987, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'pol_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 369344, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'pol_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 535598, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 139.14, 'max_sentence1_length': 468, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'por_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 508396, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'por_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 521017, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'por_Latn-cat_Latn': {'num_samples': 1997, 'number_of_characters': 560175, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 141.69, 'max_sentence2_length': 460, 'unique_sentence2': 1997}, 'por_Latn-deu_Latn': {'num_samples': 
1997, 'number_of_characters': 572865, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'por_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 576342, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'por_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 524935, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'por_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 520554, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'por_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 547015, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'por_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 570323, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'por_Latn-glg_Latn': 
{'num_samples': 1997, 'number_of_characters': 549201, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 136.2, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'por_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 477511, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'por_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 538790, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'por_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 555591, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'por_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 564187, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'por_Latn-ita_Latn': {'num_samples': 1997, 'number_of_characters': 566432, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 144.83, 'max_sentence2_length': 623, 'unique_sentence2': 1996}, 'por_Latn-jpn_Jpan': 
{'num_samples': 1997, 'number_of_characters': 388841, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'por_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 410620, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'por_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 536612, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'por_Latn-mlt_Latn': {'num_samples': 1997, 'number_of_characters': 554690, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 138.95, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'por_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 569130, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'por_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 555081, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'por_Latn-ron_Latn': 
{'num_samples': 1997, 'number_of_characters': 569700, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 146.46, 'max_sentence2_length': 518, 'unique_sentence2': 1997}, 'por_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 552108, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'por_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 565415, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'por_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 549724, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'por_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 529042, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'por_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 587176, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 
'por_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 542828, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'por_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 548336, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'por_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 368693, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'por_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 534947, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 138.82, 'max_sentence1_length': 497, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'prs_Arab-arb_Arab': {'num_samples': 1997, 'number_of_characters': 473717, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.45, 'max_sentence1_length': 365, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'prs_Arab-ckb_Arab': {'num_samples': 1997, 'number_of_characters': 494903, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.45, 'max_sentence1_length': 365, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 126.37, 'max_sentence2_length': 399, 'unique_sentence2': 
1995}, 'prs_Arab-eng_Latn': {'num_samples': 1997, 'number_of_characters': 490256, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.45, 'max_sentence1_length': 365, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'prs_Arab-fas_Arab': {'num_samples': 1997, 'number_of_characters': 485875, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.45, 'max_sentence1_length': 365, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'prs_Arab-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 442832, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.45, 'max_sentence1_length': 365, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'prs_Arab-kmr_Latn': {'num_samples': 1997, 'number_of_characters': 488482, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.45, 'max_sentence1_length': 365, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 123.16, 'max_sentence2_length': 420, 'unique_sentence2': 1996}, 'prs_Arab-mey_Arab': {'num_samples': 1997, 'number_of_characters': 456371, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.45, 'max_sentence1_length': 365, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 107.08, 'max_sentence2_length': 392, 'unique_sentence2': 1993}, 'prs_Arab-pus_Arab': {'num_samples': 1997, 'number_of_characters': 485169, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.45, 'max_sentence1_length': 365, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 121.5, 'max_sentence2_length': 366, 
'unique_sentence2': 1996}, 'prs_Arab-shi_Arab': {'num_samples': 1997, 'number_of_characters': 457449, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.45, 'max_sentence1_length': 365, 'unique_sentence1': 1997, 'min_sentence2_length': 3, 'average_sentence2_length': 107.62, 'max_sentence2_length': 378, 'unique_sentence2': 1996}, 'prs_Arab-tgk_Cyrl': {'num_samples': 1997, 'number_of_characters': 516683, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.45, 'max_sentence1_length': 365, 'unique_sentence1': 1997, 'min_sentence2_length': 3, 'average_sentence2_length': 137.28, 'max_sentence2_length': 451, 'unique_sentence2': 1995}, 'pus_Arab-arb_Arab': {'num_samples': 1997, 'number_of_characters': 473814, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 121.5, 'max_sentence1_length': 366, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'pus_Arab-ckb_Arab': {'num_samples': 1997, 'number_of_characters': 495000, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 121.5, 'max_sentence1_length': 366, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 126.37, 'max_sentence2_length': 399, 'unique_sentence2': 1995}, 'pus_Arab-eng_Latn': {'num_samples': 1997, 'number_of_characters': 490353, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.5, 'max_sentence1_length': 366, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'pus_Arab-fas_Arab': {'num_samples': 1997, 'number_of_characters': 485972, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 121.5, 'max_sentence1_length': 366, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 
'unique_sentence2': 1995}, 'pus_Arab-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 442929, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 121.5, 'max_sentence1_length': 366, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'pus_Arab-kmr_Latn': {'num_samples': 1997, 'number_of_characters': 488579, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 121.5, 'max_sentence1_length': 366, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 123.16, 'max_sentence2_length': 420, 'unique_sentence2': 1996}, 'pus_Arab-mey_Arab': {'num_samples': 1997, 'number_of_characters': 456468, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.5, 'max_sentence1_length': 366, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 107.08, 'max_sentence2_length': 392, 'unique_sentence2': 1993}, 'pus_Arab-prs_Arab': {'num_samples': 1997, 'number_of_characters': 485169, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.5, 'max_sentence1_length': 366, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 121.45, 'max_sentence2_length': 365, 'unique_sentence2': 1997}, 'pus_Arab-shi_Arab': {'num_samples': 1997, 'number_of_characters': 457546, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 121.5, 'max_sentence1_length': 366, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 107.62, 'max_sentence2_length': 378, 'unique_sentence2': 1996}, 'pus_Arab-tgk_Cyrl': {'num_samples': 1997, 'number_of_characters': 516780, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 121.5, 'max_sentence1_length': 366, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 137.28, 'max_sentence2_length': 451, 
'unique_sentence2': 1995}, 'ron_Latn-cat_Latn': {'num_samples': 1997, 'number_of_characters': 575445, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 146.46, 'max_sentence1_length': 518, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 141.69, 'max_sentence2_length': 460, 'unique_sentence2': 1997}, 'ron_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 540205, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 146.46, 'max_sentence1_length': 518, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ron_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 585593, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 146.46, 'max_sentence1_length': 518, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'ron_Latn-glg_Latn': {'num_samples': 1997, 'number_of_characters': 564471, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 146.46, 'max_sentence1_length': 518, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 136.2, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'ron_Latn-ita_Latn': {'num_samples': 1997, 'number_of_characters': 581702, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 146.46, 'max_sentence1_length': 518, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 144.83, 'max_sentence2_length': 623, 'unique_sentence2': 1996}, 'ron_Latn-mlt_Latn': {'num_samples': 1997, 'number_of_characters': 569960, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 146.46, 'max_sentence1_length': 518, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 138.95, 'max_sentence2_length': 
582, 'unique_sentence2': 1996}, 'ron_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 569700, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 146.46, 'max_sentence1_length': 518, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'ron_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 580685, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 146.46, 'max_sentence1_length': 518, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'rus_Cyrl-arb_Arab': {'num_samples': 1997, 'number_of_characters': 506074, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'rus_Cyrl-bel_Cyrl': {'num_samples': 1997, 'number_of_characters': 530983, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.24, 'max_sentence2_length': 422, 'unique_sentence2': 1996}, 'rus_Cyrl-ben_Beng': {'num_samples': 1997, 'number_of_characters': 518695, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'rus_Cyrl-bos_Latn': {'num_samples': 1997, 'number_of_characters': 529803, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 127.65, 
'max_sentence2_length': 434, 'unique_sentence2': 1996}, 'rus_Cyrl-bul_Cyrl': {'num_samples': 1997, 'number_of_characters': 544782, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 135.15, 'max_sentence2_length': 493, 'unique_sentence2': 1996}, 'rus_Cyrl-ces_Latn': {'num_samples': 1997, 'number_of_characters': 516211, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 120.84, 'max_sentence2_length': 474, 'unique_sentence2': 1997}, 'rus_Cyrl-deu_Latn': {'num_samples': 1997, 'number_of_characters': 570543, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'rus_Cyrl-ell_Grek': {'num_samples': 1997, 'number_of_characters': 574020, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'rus_Cyrl-eng_Latn': {'num_samples': 1997, 'number_of_characters': 522613, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'rus_Cyrl-fas_Arab': {'num_samples': 1997, 'number_of_characters': 518232, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 
121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'rus_Cyrl-fin_Latn': {'num_samples': 1997, 'number_of_characters': 544693, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'rus_Cyrl-fra_Latn': {'num_samples': 1997, 'number_of_characters': 568001, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'rus_Cyrl-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 475189, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'rus_Cyrl-hin_Deva': {'num_samples': 1997, 'number_of_characters': 536468, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'rus_Cyrl-hrv_Latn': {'num_samples': 1997, 'number_of_characters': 530818, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.15, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'rus_Cyrl-hun_Latn': {'num_samples': 1997, 'number_of_characters': 553269, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 
'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'rus_Cyrl-ind_Latn': {'num_samples': 1997, 'number_of_characters': 561865, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'rus_Cyrl-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 386519, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'rus_Cyrl-kor_Hang': {'num_samples': 1997, 'number_of_characters': 408298, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'rus_Cyrl-lit_Latn': {'num_samples': 1997, 'number_of_characters': 534290, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'rus_Cyrl-mkd_Cyrl': {'num_samples': 1997, 'number_of_characters': 542784, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.15, 'max_sentence2_length': 451, 'unique_sentence2': 1997}, 'rus_Cyrl-nld_Latn': {'num_samples': 1997, 'number_of_characters': 566808, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 
'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'rus_Cyrl-pol_Latn': {'num_samples': 1997, 'number_of_characters': 552759, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'rus_Cyrl-por_Latn': {'num_samples': 1997, 'number_of_characters': 552108, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'rus_Cyrl-slk_Latn': {'num_samples': 1997, 'number_of_characters': 527862, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 126.67, 'max_sentence2_length': 403, 'unique_sentence2': 1996}, 'rus_Cyrl-slv_Latn': {'num_samples': 1997, 'number_of_characters': 527789, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.64, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'rus_Cyrl-spa_Latn': {'num_samples': 1997, 'number_of_characters': 563093, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'rus_Cyrl-srp_Cyrl': {'num_samples': 1997, 'number_of_characters': 527196, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 
'min_sentence2_length': 6, 'average_sentence2_length': 126.34, 'max_sentence2_length': 439, 'unique_sentence2': 1995}, 'rus_Cyrl-srp_Latn': {'num_samples': 1997, 'number_of_characters': 531034, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 128.26, 'max_sentence2_length': 452, 'unique_sentence2': 1996}, 'rus_Cyrl-swa_Latn': {'num_samples': 1997, 'number_of_characters': 547402, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'rus_Cyrl-swe_Latn': {'num_samples': 1997, 'number_of_characters': 526720, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'rus_Cyrl-tam_Taml': {'num_samples': 1997, 'number_of_characters': 584854, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'rus_Cyrl-tur_Latn': {'num_samples': 1997, 'number_of_characters': 540506, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'rus_Cyrl-ukr_Cyrl': {'num_samples': 1997, 'number_of_characters': 537676, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 
1996, 'min_sentence2_length': 6, 'average_sentence2_length': 131.59, 'max_sentence2_length': 440, 'unique_sentence2': 1996}, 'rus_Cyrl-vie_Latn': {'num_samples': 1997, 'number_of_characters': 546014, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'rus_Cyrl-zho_Hant': {'num_samples': 1997, 'number_of_characters': 366371, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'rus_Cyrl-zul_Latn': {'num_samples': 1997, 'number_of_characters': 532625, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 137.65, 'max_sentence1_length': 419, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'shi_Arab-arb_Arab': {'num_samples': 1997, 'number_of_characters': 446094, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 107.62, 'max_sentence1_length': 378, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'shi_Arab-ckb_Arab': {'num_samples': 1997, 'number_of_characters': 467280, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 107.62, 'max_sentence1_length': 378, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 126.37, 'max_sentence2_length': 399, 'unique_sentence2': 1995}, 'shi_Arab-eng_Latn': {'num_samples': 1997, 'number_of_characters': 462633, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 107.62, 'max_sentence1_length': 378, 
'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'shi_Arab-fas_Arab': {'num_samples': 1997, 'number_of_characters': 458252, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 107.62, 'max_sentence1_length': 378, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'shi_Arab-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 415209, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 107.62, 'max_sentence1_length': 378, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'shi_Arab-kmr_Latn': {'num_samples': 1997, 'number_of_characters': 460859, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 107.62, 'max_sentence1_length': 378, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 123.16, 'max_sentence2_length': 420, 'unique_sentence2': 1996}, 'shi_Arab-mey_Arab': {'num_samples': 1997, 'number_of_characters': 428748, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 107.62, 'max_sentence1_length': 378, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 107.08, 'max_sentence2_length': 392, 'unique_sentence2': 1993}, 'shi_Arab-prs_Arab': {'num_samples': 1997, 'number_of_characters': 457449, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 107.62, 'max_sentence1_length': 378, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 121.45, 'max_sentence2_length': 365, 'unique_sentence2': 1997}, 'shi_Arab-pus_Arab': {'num_samples': 1997, 'number_of_characters': 457546, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 107.62, 'max_sentence1_length': 
378, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 121.5, 'max_sentence2_length': 366, 'unique_sentence2': 1996}, 'shi_Arab-tgk_Cyrl': {'num_samples': 1997, 'number_of_characters': 489060, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 107.62, 'max_sentence1_length': 378, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 137.28, 'max_sentence2_length': 451, 'unique_sentence2': 1995}, 'sin_Sinh-ben_Beng': {'num_samples': 1997, 'number_of_characters': 502543, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'sin_Sinh-div_Thaa': {'num_samples': 1997, 'number_of_characters': 562589, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'sin_Sinh-eng_Latn': {'num_samples': 1997, 'number_of_characters': 506461, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'sin_Sinh-eus_Latn': {'num_samples': 1997, 'number_of_characters': 533944, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'sin_Sinh-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 503672, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 
'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'sin_Sinh-hin_Deva': {'num_samples': 1997, 'number_of_characters': 520316, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'sin_Sinh-kan_Knda': {'num_samples': 1997, 'number_of_characters': 524277, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'sin_Sinh-mar_Deva': {'num_samples': 1997, 'number_of_characters': 519628, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'sin_Sinh-nep_Deva': {'num_samples': 1997, 'number_of_characters': 506964, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'sin_Sinh-pan_Guru': {'num_samples': 1997, 'number_of_characters': 509163, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'sin_Sinh-snd_Arab': {'num_samples': 1997, 'number_of_characters': 479068, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 
129.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'sin_Sinh-tam_Taml': {'num_samples': 1997, 'number_of_characters': 568702, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'sin_Sinh-tel_Telu': {'num_samples': 1997, 'number_of_characters': 506268, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'sin_Sinh-urd_Arab': {'num_samples': 1997, 'number_of_characters': 506739, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 129.56, 'max_sentence1_length': 441, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'slk_Latn-bel_Cyrl': {'num_samples': 1997, 'number_of_characters': 509059, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 126.67, 'max_sentence1_length': 403, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.24, 'max_sentence2_length': 422, 'unique_sentence2': 1996}, 'slk_Latn-bos_Latn': {'num_samples': 1997, 'number_of_characters': 507879, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 126.67, 'max_sentence1_length': 403, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 127.65, 'max_sentence2_length': 434, 'unique_sentence2': 1996}, 'slk_Latn-bul_Cyrl': {'num_samples': 1997, 'number_of_characters': 522858, 'unique_pairs': 1996, 'min_sentence1_length': 9, 
'average_sentence1_length': 126.67, 'max_sentence1_length': 403, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 135.15, 'max_sentence2_length': 493, 'unique_sentence2': 1996}, 'slk_Latn-ces_Latn': {'num_samples': 1997, 'number_of_characters': 494287, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 126.67, 'max_sentence1_length': 403, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 120.84, 'max_sentence2_length': 474, 'unique_sentence2': 1997}, 'slk_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 500689, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 126.67, 'max_sentence1_length': 403, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'slk_Latn-hrv_Latn': {'num_samples': 1997, 'number_of_characters': 508894, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 126.67, 'max_sentence1_length': 403, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.15, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'slk_Latn-mkd_Cyrl': {'num_samples': 1997, 'number_of_characters': 520860, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 126.67, 'max_sentence1_length': 403, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.15, 'max_sentence2_length': 451, 'unique_sentence2': 1997}, 'slk_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 530835, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 126.67, 'max_sentence1_length': 403, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'slk_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 527862, 'unique_pairs': 1996, 
'min_sentence1_length': 9, 'average_sentence1_length': 126.67, 'max_sentence1_length': 403, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'slk_Latn-slv_Latn': {'num_samples': 1997, 'number_of_characters': 505865, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 126.67, 'max_sentence1_length': 403, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.64, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'slk_Latn-srp_Cyrl': {'num_samples': 1997, 'number_of_characters': 505272, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 126.67, 'max_sentence1_length': 403, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.34, 'max_sentence2_length': 439, 'unique_sentence2': 1995}, 'slk_Latn-srp_Latn': {'num_samples': 1997, 'number_of_characters': 509110, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 126.67, 'max_sentence1_length': 403, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 128.26, 'max_sentence2_length': 452, 'unique_sentence2': 1996}, 'slk_Latn-ukr_Cyrl': {'num_samples': 1997, 'number_of_characters': 515752, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 126.67, 'max_sentence1_length': 403, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 131.59, 'max_sentence2_length': 440, 'unique_sentence2': 1996}, 'slv_Latn-bel_Cyrl': {'num_samples': 1997, 'number_of_characters': 508986, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.64, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.24, 'max_sentence2_length': 422, 'unique_sentence2': 1996}, 'slv_Latn-bos_Latn': {'num_samples': 1997, 'number_of_characters': 507806, 'unique_pairs': 
1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.64, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 127.65, 'max_sentence2_length': 434, 'unique_sentence2': 1996}, 'slv_Latn-bul_Cyrl': {'num_samples': 1997, 'number_of_characters': 522785, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.64, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 135.15, 'max_sentence2_length': 493, 'unique_sentence2': 1996}, 'slv_Latn-ces_Latn': {'num_samples': 1997, 'number_of_characters': 494214, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 126.64, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 120.84, 'max_sentence2_length': 474, 'unique_sentence2': 1997}, 'slv_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 500616, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 126.64, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'slv_Latn-hrv_Latn': {'num_samples': 1997, 'number_of_characters': 508821, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 126.64, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.15, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'slv_Latn-mkd_Cyrl': {'num_samples': 1997, 'number_of_characters': 520787, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 126.64, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.15, 'max_sentence2_length': 451, 'unique_sentence2': 1997}, 'slv_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 530762, 
'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.64, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'slv_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 527789, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.64, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'slv_Latn-slk_Latn': {'num_samples': 1997, 'number_of_characters': 505865, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.64, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 126.67, 'max_sentence2_length': 403, 'unique_sentence2': 1996}, 'slv_Latn-srp_Cyrl': {'num_samples': 1997, 'number_of_characters': 505199, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.64, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.34, 'max_sentence2_length': 439, 'unique_sentence2': 1995}, 'slv_Latn-srp_Latn': {'num_samples': 1997, 'number_of_characters': 509037, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.64, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 128.26, 'max_sentence2_length': 452, 'unique_sentence2': 1996}, 'slv_Latn-ukr_Cyrl': {'num_samples': 1997, 'number_of_characters': 515679, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.64, 'max_sentence1_length': 463, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 131.59, 'max_sentence2_length': 440, 'unique_sentence2': 1996}, 'smo_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 
525575, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 139.14, 'max_sentence1_length': 431, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'smo_Latn-fij_Latn': {'num_samples': 1997, 'number_of_characters': 578360, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 139.14, 'max_sentence1_length': 431, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 150.48, 'max_sentence2_length': 448, 'unique_sentence2': 1988}, 'smo_Latn-fil_Latn': {'num_samples': 1997, 'number_of_characters': 571275, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 139.14, 'max_sentence1_length': 431, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 146.93, 'max_sentence2_length': 554, 'unique_sentence2': 1997}, 'smo_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 564827, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 139.14, 'max_sentence1_length': 431, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'smo_Latn-mal_Mlym': {'num_samples': 1997, 'number_of_characters': 582007, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 139.14, 'max_sentence1_length': 431, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 152.3, 'max_sentence2_length': 540, 'unique_sentence2': 1996}, 'smo_Latn-mlg_Latn': {'num_samples': 1997, 'number_of_characters': 598163, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 139.14, 'max_sentence1_length': 431, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 160.39, 'max_sentence2_length': 559, 'unique_sentence2': 1994}, 'smo_Latn-mri_Latn': {'num_samples': 1997, 
'number_of_characters': 551979, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 139.14, 'max_sentence1_length': 431, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 137.27, 'max_sentence2_length': 443, 'unique_sentence2': 1997}, 'smo_Latn-msa_Latn': {'num_samples': 1997, 'number_of_characters': 555038, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 139.14, 'max_sentence1_length': 431, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 138.8, 'max_sentence2_length': 463, 'unique_sentence2': 1997}, 'smo_Latn-tah_Latn': {'num_samples': 1997, 'number_of_characters': 587478, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 139.14, 'max_sentence1_length': 431, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 155.04, 'max_sentence2_length': 524, 'unique_sentence2': 1997}, 'smo_Latn-ton_Latn': {'num_samples': 1997, 'number_of_characters': 591495, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 139.14, 'max_sentence1_length': 431, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 157.06, 'max_sentence2_length': 468, 'unique_sentence2': 1997}, 'sna_Latn-bem_Latn': {'num_samples': 1997, 'number_of_characters': 596822, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 149.39, 'max_sentence1_length': 511, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 149.47, 'max_sentence2_length': 465, 'unique_sentence2': 1997}, 'sna_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 546050, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 149.39, 'max_sentence1_length': 511, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'sna_Latn-ewe_Latn': {'num_samples': 
1997, 'number_of_characters': 537308, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 149.39, 'max_sentence1_length': 511, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 119.67, 'max_sentence2_length': 493, 'unique_sentence2': 1994}, 'sna_Latn-fuc_Latn': {'num_samples': 1997, 'number_of_characters': 526810, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 149.39, 'max_sentence1_length': 511, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 114.41, 'max_sentence2_length': 376, 'unique_sentence2': 1996}, 'sna_Latn-kin_Latn': {'num_samples': 1997, 'number_of_characters': 602117, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 149.39, 'max_sentence1_length': 511, 'unique_sentence1': 1995, 'min_sentence2_length': 10, 'average_sentence2_length': 152.12, 'max_sentence2_length': 541, 'unique_sentence2': 1996}, 'sna_Latn-nde_Latn': {'num_samples': 1997, 'number_of_characters': 596069, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 149.39, 'max_sentence1_length': 511, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 149.09, 'max_sentence2_length': 590, 'unique_sentence2': 1997}, 'sna_Latn-nya_Latn': {'num_samples': 1997, 'number_of_characters': 582612, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 149.39, 'max_sentence1_length': 511, 'unique_sentence1': 1995, 'min_sentence2_length': 10, 'average_sentence2_length': 142.35, 'max_sentence2_length': 464, 'unique_sentence2': 1993}, 'sna_Latn-ven_Latn': {'num_samples': 1997, 'number_of_characters': 598086, 'unique_pairs': 1995, 'min_sentence1_length': 6, 'average_sentence1_length': 149.39, 'max_sentence1_length': 511, 'unique_sentence1': 1995, 'min_sentence2_length': 10, 'average_sentence2_length': 150.1, 'max_sentence2_length': 535, 'unique_sentence2': 1993}, 'snd_Arab-ben_Beng': 
{'num_samples': 1997, 'number_of_characters': 464129, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'snd_Arab-div_Thaa': {'num_samples': 1997, 'number_of_characters': 524175, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'snd_Arab-eng_Latn': {'num_samples': 1997, 'number_of_characters': 468047, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'snd_Arab-eus_Latn': {'num_samples': 1997, 'number_of_characters': 495530, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'snd_Arab-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 465258, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'snd_Arab-hin_Deva': {'num_samples': 1997, 'number_of_characters': 481902, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 
'snd_Arab-kan_Knda': {'num_samples': 1997, 'number_of_characters': 485863, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'snd_Arab-mar_Deva': {'num_samples': 1997, 'number_of_characters': 481214, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'snd_Arab-nep_Deva': {'num_samples': 1997, 'number_of_characters': 468550, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'snd_Arab-pan_Guru': {'num_samples': 1997, 'number_of_characters': 470749, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'snd_Arab-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 479068, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'snd_Arab-tam_Taml': {'num_samples': 1997, 'number_of_characters': 530288, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 
1997}, 'snd_Arab-tel_Telu': {'num_samples': 1997, 'number_of_characters': 467854, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'snd_Arab-urd_Arab': {'num_samples': 1997, 'number_of_characters': 468325, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 110.33, 'max_sentence1_length': 335, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'som_Latn-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 458799, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'som_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 539012, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'som_Latn-hau_Latn': {'num_samples': 1997, 'number_of_characters': 561258, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'som_Latn-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 537393, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 
'unique_sentence2': 1997}, 'som_Latn-nso_Latn': {'num_samples': 1997, 'number_of_characters': 582791, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 1996}, 'som_Latn-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 528723, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'som_Latn-ssw_Latn': {'num_samples': 1997, 'number_of_characters': 579434, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'som_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 563801, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'som_Latn-tir_Ethi': {'num_samples': 1997, 'number_of_characters': 456530, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 272, 'unique_sentence2': 1996}, 'som_Latn-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 625575, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 
556, 'unique_sentence2': 1997}, 'som_Latn-wol_Latn': {'num_samples': 1997, 'number_of_characters': 531095, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 'som_Latn-xho_Latn': {'num_samples': 1997, 'number_of_characters': 559382, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'som_Latn-yor_Latn': {'num_samples': 1997, 'number_of_characters': 607380, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'som_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 549024, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 145.86, 'max_sentence1_length': 455, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'spa_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 519381, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'spa_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 532002, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 
'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'spa_Latn-cat_Latn': {'num_samples': 1997, 'number_of_characters': 571160, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 141.69, 'max_sentence2_length': 460, 'unique_sentence2': 1997}, 'spa_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 583850, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'spa_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 587327, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'spa_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 535920, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'spa_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 531539, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'spa_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 558000, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 
135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'spa_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 581308, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'spa_Latn-glg_Latn': {'num_samples': 1997, 'number_of_characters': 560186, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 136.2, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'spa_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 488496, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'spa_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 549775, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'spa_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 566576, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'spa_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 575172, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 
143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'spa_Latn-ita_Latn': {'num_samples': 1997, 'number_of_characters': 577417, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 144.83, 'max_sentence2_length': 623, 'unique_sentence2': 1996}, 'spa_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 399826, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'spa_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 421605, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'spa_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 547597, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'spa_Latn-mlt_Latn': {'num_samples': 1997, 'number_of_characters': 565675, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 138.95, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'spa_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 580115, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 
146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'spa_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 566066, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'spa_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 565415, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'spa_Latn-ron_Latn': {'num_samples': 1997, 'number_of_characters': 580685, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 146.46, 'max_sentence2_length': 518, 'unique_sentence2': 1997}, 'spa_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 563093, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'spa_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 560709, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'spa_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 540027, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 
'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'spa_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 598161, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'spa_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 553813, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'spa_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 559321, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'spa_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 379678, 'unique_pairs': 1996, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'spa_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 545932, 'unique_pairs': 1997, 'min_sentence1_length': 1, 'average_sentence1_length': 144.32, 'max_sentence1_length': 504, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'sqi_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 582734, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 142.02, 'max_sentence1_length': 461, 'unique_sentence1': 1996, 'min_sentence2_length': 
7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'sqi_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 531327, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 142.02, 'max_sentence1_length': 461, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'sqi_Latn-hye_Armn': {'num_samples': 1997, 'number_of_characters': 548322, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 142.02, 'max_sentence1_length': 461, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 132.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'sqi_Latn-kat_Geor': {'num_samples': 1997, 'number_of_characters': 550199, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 142.02, 'max_sentence1_length': 461, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 133.5, 'max_sentence2_length': 503, 'unique_sentence2': 1995}, 'srp_Cyrl-bel_Cyrl': {'num_samples': 1997, 'number_of_characters': 508393, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.34, 'max_sentence1_length': 439, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 128.24, 'max_sentence2_length': 422, 'unique_sentence2': 1996}, 'srp_Cyrl-bos_Latn': {'num_samples': 1997, 'number_of_characters': 507213, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.34, 'max_sentence1_length': 439, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 127.65, 'max_sentence2_length': 434, 'unique_sentence2': 1996}, 'srp_Cyrl-bul_Cyrl': {'num_samples': 1997, 'number_of_characters': 522192, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.34, 'max_sentence1_length': 439, 'unique_sentence1': 1995, 
'min_sentence2_length': 6, 'average_sentence2_length': 135.15, 'max_sentence2_length': 493, 'unique_sentence2': 1996}, 'srp_Cyrl-ces_Latn': {'num_samples': 1997, 'number_of_characters': 493621, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 126.34, 'max_sentence1_length': 439, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 120.84, 'max_sentence2_length': 474, 'unique_sentence2': 1997}, 'srp_Cyrl-eng_Latn': {'num_samples': 1997, 'number_of_characters': 500023, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 126.34, 'max_sentence1_length': 439, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'srp_Cyrl-hrv_Latn': {'num_samples': 1997, 'number_of_characters': 508228, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 126.34, 'max_sentence1_length': 439, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 128.15, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'srp_Cyrl-mkd_Cyrl': {'num_samples': 1997, 'number_of_characters': 520194, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 126.34, 'max_sentence1_length': 439, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 134.15, 'max_sentence2_length': 451, 'unique_sentence2': 1997}, 'srp_Cyrl-pol_Latn': {'num_samples': 1997, 'number_of_characters': 530169, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.34, 'max_sentence1_length': 439, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'srp_Cyrl-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 527196, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.34, 'max_sentence1_length': 439, 'unique_sentence1': 
1995, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'srp_Cyrl-slk_Latn': {'num_samples': 1997, 'number_of_characters': 505272, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.34, 'max_sentence1_length': 439, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 126.67, 'max_sentence2_length': 403, 'unique_sentence2': 1996}, 'srp_Cyrl-slv_Latn': {'num_samples': 1997, 'number_of_characters': 505199, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.34, 'max_sentence1_length': 439, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 126.64, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'srp_Cyrl-srp_Latn': {'num_samples': 1997, 'number_of_characters': 508444, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.34, 'max_sentence1_length': 439, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 128.26, 'max_sentence2_length': 452, 'unique_sentence2': 1996}, 'srp_Cyrl-ukr_Cyrl': {'num_samples': 1997, 'number_of_characters': 515086, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 126.34, 'max_sentence1_length': 439, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 131.59, 'max_sentence2_length': 440, 'unique_sentence2': 1996}, 'srp_Latn-bel_Cyrl': {'num_samples': 1997, 'number_of_characters': 512231, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 128.26, 'max_sentence1_length': 452, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.24, 'max_sentence2_length': 422, 'unique_sentence2': 1996}, 'srp_Latn-bos_Latn': {'num_samples': 1997, 'number_of_characters': 511051, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 128.26, 'max_sentence1_length': 452, 
'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 127.65, 'max_sentence2_length': 434, 'unique_sentence2': 1996}, 'srp_Latn-bul_Cyrl': {'num_samples': 1997, 'number_of_characters': 526030, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 128.26, 'max_sentence1_length': 452, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 135.15, 'max_sentence2_length': 493, 'unique_sentence2': 1996}, 'srp_Latn-ces_Latn': {'num_samples': 1997, 'number_of_characters': 497459, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 128.26, 'max_sentence1_length': 452, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 120.84, 'max_sentence2_length': 474, 'unique_sentence2': 1997}, 'srp_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 503861, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 128.26, 'max_sentence1_length': 452, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'srp_Latn-hrv_Latn': {'num_samples': 1997, 'number_of_characters': 512066, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 128.26, 'max_sentence1_length': 452, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.15, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'srp_Latn-mkd_Cyrl': {'num_samples': 1997, 'number_of_characters': 524032, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 128.26, 'max_sentence1_length': 452, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.15, 'max_sentence2_length': 451, 'unique_sentence2': 1997}, 'srp_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 534007, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 128.26, 'max_sentence1_length': 
452, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'srp_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 531034, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 128.26, 'max_sentence1_length': 452, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'srp_Latn-slk_Latn': {'num_samples': 1997, 'number_of_characters': 509110, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 128.26, 'max_sentence1_length': 452, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 126.67, 'max_sentence2_length': 403, 'unique_sentence2': 1996}, 'srp_Latn-slv_Latn': {'num_samples': 1997, 'number_of_characters': 509037, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 128.26, 'max_sentence1_length': 452, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.64, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'srp_Latn-srp_Cyrl': {'num_samples': 1997, 'number_of_characters': 508444, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 128.26, 'max_sentence1_length': 452, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.34, 'max_sentence2_length': 439, 'unique_sentence2': 1995}, 'srp_Latn-ukr_Cyrl': {'num_samples': 1997, 'number_of_characters': 518924, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 128.26, 'max_sentence1_length': 452, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 131.59, 'max_sentence2_length': 440, 'unique_sentence2': 1996}, 'ssw_Latn-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 455649, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 144.29, 
'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'ssw_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 535862, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 144.29, 'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ssw_Latn-hau_Latn': {'num_samples': 1997, 'number_of_characters': 558108, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 144.29, 'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'ssw_Latn-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 534243, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 144.29, 'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 'unique_sentence2': 1997}, 'ssw_Latn-nso_Latn': {'num_samples': 1997, 'number_of_characters': 579641, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 144.29, 'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 1996}, 'ssw_Latn-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 525573, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 144.29, 'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'ssw_Latn-som_Latn': {'num_samples': 1997, 'number_of_characters': 579434, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 
144.29, 'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'ssw_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 560651, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 144.29, 'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'ssw_Latn-tir_Ethi': {'num_samples': 1997, 'number_of_characters': 453380, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 144.29, 'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 272, 'unique_sentence2': 1996}, 'ssw_Latn-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 622425, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 144.29, 'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 'unique_sentence2': 1997}, 'ssw_Latn-wol_Latn': {'num_samples': 1997, 'number_of_characters': 527945, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 144.29, 'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 'ssw_Latn-xho_Latn': {'num_samples': 1997, 'number_of_characters': 556232, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 144.29, 'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'ssw_Latn-yor_Latn': {'num_samples': 1997, 'number_of_characters': 604230, 'unique_pairs': 1997, 'min_sentence1_length': 8, 
'average_sentence1_length': 144.29, 'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'ssw_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 545874, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 144.29, 'max_sentence1_length': 510, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'swa_Latn-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 440016, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'swa_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 503690, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'swa_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 516311, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'swa_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 568159, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'swa_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 571636, 'unique_pairs': 1997, 
'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'swa_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 520229, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'swa_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 515848, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'swa_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 542309, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'swa_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 565617, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'swa_Latn-hau_Latn': {'num_samples': 1997, 'number_of_characters': 542475, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'swa_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 472805, 
'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'swa_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 534084, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'swa_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 550885, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'swa_Latn-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 518610, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 'unique_sentence2': 1997}, 'swa_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 559481, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'swa_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 384135, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'swa_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 
405914, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'swa_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 531906, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'swa_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 564424, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'swa_Latn-nso_Latn': {'num_samples': 1997, 'number_of_characters': 564008, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 1996}, 'swa_Latn-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 509940, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'swa_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 550375, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'swa_Latn-por_Latn': {'num_samples': 1997, 
'number_of_characters': 549724, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'swa_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 547402, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'swa_Latn-som_Latn': {'num_samples': 1997, 'number_of_characters': 563801, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'swa_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 560709, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'swa_Latn-ssw_Latn': {'num_samples': 1997, 'number_of_characters': 560651, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'swa_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 524336, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'swa_Latn-tam_Taml': 
{'num_samples': 1997, 'number_of_characters': 582470, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'swa_Latn-tir_Ethi': {'num_samples': 1997, 'number_of_characters': 437747, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 272, 'unique_sentence2': 1996}, 'swa_Latn-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 606792, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 'unique_sentence2': 1997}, 'swa_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 538122, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'swa_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 543630, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'swa_Latn-wol_Latn': {'num_samples': 1997, 'number_of_characters': 512312, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 
'swa_Latn-xho_Latn': {'num_samples': 1997, 'number_of_characters': 540599, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'swa_Latn-yor_Latn': {'num_samples': 1997, 'number_of_characters': 588597, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'swa_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 363987, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'swa_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 530241, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 136.46, 'max_sentence1_length': 430, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'swe_Latn-afr_Latn': {'num_samples': 1997, 'number_of_characters': 520179, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 134.38, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'swe_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 483008, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 
1995}, 'swe_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 495629, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'swe_Latn-dan_Latn': {'num_samples': 1997, 'number_of_characters': 503965, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 126.26, 'max_sentence2_length': 522, 'unique_sentence2': 1995}, 'swe_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 547477, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'swe_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 550954, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'swe_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 499547, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'swe_Latn-fao_Latn': {'num_samples': 1997, 'number_of_characters': 509630, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.1, 'max_sentence2_length': 433, 'unique_sentence2': 
1997}, 'swe_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 495166, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'swe_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 521627, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'swe_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 544935, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'swe_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 452123, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'swe_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 513402, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'swe_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 530203, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 
1997}, 'swe_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 538799, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'swe_Latn-isl_Latn': {'num_samples': 1997, 'number_of_characters': 514035, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 131.3, 'max_sentence2_length': 399, 'unique_sentence2': 1996}, 'swe_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 363453, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'swe_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 385232, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'swe_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 511224, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'swe_Latn-ltz_Latn': {'num_samples': 1997, 'number_of_characters': 532584, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 140.59, 'max_sentence2_length': 543, 'unique_sentence2': 1996}, 
'swe_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 543742, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'swe_Latn-nno_Latn': {'num_samples': 1997, 'number_of_characters': 500184, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.37, 'max_sentence2_length': 417, 'unique_sentence2': 1996}, 'swe_Latn-nob_Latn': {'num_samples': 1997, 'number_of_characters': 503271, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 125.91, 'max_sentence2_length': 482, 'unique_sentence2': 1996}, 'swe_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 529693, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'swe_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 529042, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'swe_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 526720, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 
'swe_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 540027, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'swe_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 524336, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'swe_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 561788, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'swe_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 517440, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'swe_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 522948, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'swe_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 343305, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 
'swe_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 509559, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 126.1, 'max_sentence1_length': 430, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'tah_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 557343, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 155.04, 'max_sentence1_length': 524, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'tah_Latn-fij_Latn': {'num_samples': 1997, 'number_of_characters': 610128, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 155.04, 'max_sentence1_length': 524, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 150.48, 'max_sentence2_length': 448, 'unique_sentence2': 1988}, 'tah_Latn-fil_Latn': {'num_samples': 1997, 'number_of_characters': 603043, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 155.04, 'max_sentence1_length': 524, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 146.93, 'max_sentence2_length': 554, 'unique_sentence2': 1997}, 'tah_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 596595, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 155.04, 'max_sentence1_length': 524, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'tah_Latn-mal_Mlym': {'num_samples': 1997, 'number_of_characters': 613775, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 155.04, 'max_sentence1_length': 524, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 152.3, 'max_sentence2_length': 540, 'unique_sentence2': 
1996}, 'tah_Latn-mlg_Latn': {'num_samples': 1997, 'number_of_characters': 629931, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 155.04, 'max_sentence1_length': 524, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 160.39, 'max_sentence2_length': 559, 'unique_sentence2': 1994}, 'tah_Latn-mri_Latn': {'num_samples': 1997, 'number_of_characters': 583747, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 155.04, 'max_sentence1_length': 524, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 137.27, 'max_sentence2_length': 443, 'unique_sentence2': 1997}, 'tah_Latn-msa_Latn': {'num_samples': 1997, 'number_of_characters': 586806, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 155.04, 'max_sentence1_length': 524, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 138.8, 'max_sentence2_length': 463, 'unique_sentence2': 1997}, 'tah_Latn-smo_Latn': {'num_samples': 1997, 'number_of_characters': 587478, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 155.04, 'max_sentence1_length': 524, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 139.14, 'max_sentence2_length': 431, 'unique_sentence2': 1996}, 'tah_Latn-ton_Latn': {'num_samples': 1997, 'number_of_characters': 623263, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 155.04, 'max_sentence1_length': 524, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 157.06, 'max_sentence2_length': 468, 'unique_sentence2': 1997}, 'tam_Taml-arb_Arab': {'num_samples': 1997, 'number_of_characters': 541142, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 
'unique_sentence2': 1995}, 'tam_Taml-ben_Beng': {'num_samples': 1997, 'number_of_characters': 553763, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'tam_Taml-deu_Latn': {'num_samples': 1997, 'number_of_characters': 605611, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'tam_Taml-div_Thaa': {'num_samples': 1997, 'number_of_characters': 613809, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'tam_Taml-ell_Grek': {'num_samples': 1997, 'number_of_characters': 609088, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'tam_Taml-eng_Latn': {'num_samples': 1997, 'number_of_characters': 557681, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'tam_Taml-eus_Latn': {'num_samples': 1997, 'number_of_characters': 585164, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 
'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'tam_Taml-fas_Arab': {'num_samples': 1997, 'number_of_characters': 553300, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'tam_Taml-fin_Latn': {'num_samples': 1997, 'number_of_characters': 579761, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'tam_Taml-fra_Latn': {'num_samples': 1997, 'number_of_characters': 603069, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'tam_Taml-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 554892, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'tam_Taml-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 510257, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'tam_Taml-hin_Deva': {'num_samples': 1997, 'number_of_characters': 571536, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 
130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'tam_Taml-hun_Latn': {'num_samples': 1997, 'number_of_characters': 588337, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'tam_Taml-ind_Latn': {'num_samples': 1997, 'number_of_characters': 596933, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'tam_Taml-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 421587, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'tam_Taml-kan_Knda': {'num_samples': 1997, 'number_of_characters': 575497, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'tam_Taml-kor_Hang': {'num_samples': 1997, 'number_of_characters': 443366, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'tam_Taml-lit_Latn': {'num_samples': 1997, 'number_of_characters': 569358, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 
'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'tam_Taml-mar_Deva': {'num_samples': 1997, 'number_of_characters': 570848, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'tam_Taml-nep_Deva': {'num_samples': 1997, 'number_of_characters': 558184, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'tam_Taml-nld_Latn': {'num_samples': 1997, 'number_of_characters': 601876, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'tam_Taml-pan_Guru': {'num_samples': 1997, 'number_of_characters': 560383, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'tam_Taml-pol_Latn': {'num_samples': 1997, 'number_of_characters': 587827, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'tam_Taml-por_Latn': {'num_samples': 1997, 'number_of_characters': 587176, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 
'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'tam_Taml-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 584854, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'tam_Taml-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 568702, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'tam_Taml-snd_Arab': {'num_samples': 1997, 'number_of_characters': 530288, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'tam_Taml-spa_Latn': {'num_samples': 1997, 'number_of_characters': 598161, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'tam_Taml-swa_Latn': {'num_samples': 1997, 'number_of_characters': 582470, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'tam_Taml-swe_Latn': {'num_samples': 1997, 'number_of_characters': 561788, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 
'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'tam_Taml-tel_Telu': {'num_samples': 1997, 'number_of_characters': 557488, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'tam_Taml-tur_Latn': {'num_samples': 1997, 'number_of_characters': 575574, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'tam_Taml-urd_Arab': {'num_samples': 1997, 'number_of_characters': 557959, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'tam_Taml-vie_Latn': {'num_samples': 1997, 'number_of_characters': 581082, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'tam_Taml-zho_Hant': {'num_samples': 1997, 'number_of_characters': 401439, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'tam_Taml-zul_Latn': {'num_samples': 1997, 'number_of_characters': 567693, 'unique_pairs': 1997, 'min_sentence1_length': 11, 'average_sentence1_length': 155.21, 
'max_sentence1_length': 581, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'tat_Cyrl-aze_Latn': {'num_samples': 1997, 'number_of_characters': 515560, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 123.15, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 135.02, 'max_sentence2_length': 398, 'unique_sentence2': 1997}, 'tat_Cyrl-bak_Cyrl': {'num_samples': 1997, 'number_of_characters': 492252, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 123.15, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 123.35, 'max_sentence2_length': 437, 'unique_sentence2': 1995}, 'tat_Cyrl-eng_Latn': {'num_samples': 1997, 'number_of_characters': 493646, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 123.15, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'tat_Cyrl-kaz_Cyrl': {'num_samples': 1997, 'number_of_characters': 506202, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 123.15, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 130.33, 'max_sentence2_length': 473, 'unique_sentence2': 1996}, 'tat_Cyrl-kir_Cyrl': {'num_samples': 1997, 'number_of_characters': 496790, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 123.15, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 125.62, 'max_sentence2_length': 395, 'unique_sentence2': 1996}, 'tat_Cyrl-tuk_Latn': {'num_samples': 1997, 'number_of_characters': 531200, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 
123.15, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 142.85, 'max_sentence2_length': 576, 'unique_sentence2': 1996}, 'tat_Cyrl-tur_Latn': {'num_samples': 1997, 'number_of_characters': 511539, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 123.15, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'tat_Cyrl-uig_Arab': {'num_samples': 1997, 'number_of_characters': 556948, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 123.15, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 155.74, 'max_sentence2_length': 592, 'unique_sentence2': 1996}, 'tat_Cyrl-uzb_Latn': {'num_samples': 1997, 'number_of_characters': 539621, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 123.15, 'max_sentence1_length': 539, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 147.07, 'max_sentence2_length': 470, 'unique_sentence2': 1996}, 'tel_Telu-ben_Beng': {'num_samples': 1997, 'number_of_characters': 491329, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'tel_Telu-div_Thaa': {'num_samples': 1997, 'number_of_characters': 551375, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'tel_Telu-eng_Latn': {'num_samples': 1997, 'number_of_characters': 495247, 'unique_pairs': 1997, 'min_sentence1_length': 10, 
'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'tel_Telu-eus_Latn': {'num_samples': 1997, 'number_of_characters': 522730, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'tel_Telu-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 492458, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'tel_Telu-hin_Deva': {'num_samples': 1997, 'number_of_characters': 509102, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'tel_Telu-kan_Knda': {'num_samples': 1997, 'number_of_characters': 513063, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'tel_Telu-mar_Deva': {'num_samples': 1997, 'number_of_characters': 508414, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'tel_Telu-nep_Deva': {'num_samples': 1997, 'number_of_characters': 495750, 'unique_pairs': 1996, 
'min_sentence1_length': 10, 'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'tel_Telu-pan_Guru': {'num_samples': 1997, 'number_of_characters': 497949, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'tel_Telu-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 506268, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'tel_Telu-snd_Arab': {'num_samples': 1997, 'number_of_characters': 467854, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'tel_Telu-tam_Taml': {'num_samples': 1997, 'number_of_characters': 557488, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'tel_Telu-urd_Arab': {'num_samples': 1997, 'number_of_characters': 495525, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 123.95, 'max_sentence1_length': 412, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.19, 'max_sentence2_length': 390, 'unique_sentence2': 1996}, 'tgk_Cyrl-arb_Arab': {'num_samples': 1997, 'number_of_characters': 505328, 
'unique_pairs': 1995, 'min_sentence1_length': 3, 'average_sentence1_length': 137.28, 'max_sentence1_length': 451, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'tgk_Cyrl-ckb_Arab': {'num_samples': 1997, 'number_of_characters': 526514, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 137.28, 'max_sentence1_length': 451, 'unique_sentence1': 1995, 'min_sentence2_length': 5, 'average_sentence2_length': 126.37, 'max_sentence2_length': 399, 'unique_sentence2': 1995}, 'tgk_Cyrl-eng_Latn': {'num_samples': 1997, 'number_of_characters': 521867, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 137.28, 'max_sentence1_length': 451, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'tgk_Cyrl-fas_Arab': {'num_samples': 1997, 'number_of_characters': 517486, 'unique_pairs': 1995, 'min_sentence1_length': 3, 'average_sentence1_length': 137.28, 'max_sentence1_length': 451, 'unique_sentence1': 1995, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'tgk_Cyrl-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 474443, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 137.28, 'max_sentence1_length': 451, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'tgk_Cyrl-kmr_Latn': {'num_samples': 1997, 'number_of_characters': 520093, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 137.28, 'max_sentence1_length': 451, 'unique_sentence1': 1995, 'min_sentence2_length': 7, 'average_sentence2_length': 123.16, 'max_sentence2_length': 420, 'unique_sentence2': 1996}, 'tgk_Cyrl-mey_Arab': {'num_samples': 1997, 'number_of_characters': 
487982, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 137.28, 'max_sentence1_length': 451, 'unique_sentence1': 1995, 'min_sentence2_length': 6, 'average_sentence2_length': 107.08, 'max_sentence2_length': 392, 'unique_sentence2': 1993}, 'tgk_Cyrl-prs_Arab': {'num_samples': 1997, 'number_of_characters': 516683, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 137.28, 'max_sentence1_length': 451, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 121.45, 'max_sentence2_length': 365, 'unique_sentence2': 1997}, 'tgk_Cyrl-pus_Arab': {'num_samples': 1997, 'number_of_characters': 516780, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 137.28, 'max_sentence1_length': 451, 'unique_sentence1': 1995, 'min_sentence2_length': 8, 'average_sentence2_length': 121.5, 'max_sentence2_length': 366, 'unique_sentence2': 1996}, 'tgk_Cyrl-shi_Arab': {'num_samples': 1997, 'number_of_characters': 489060, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 137.28, 'max_sentence1_length': 451, 'unique_sentence1': 1995, 'min_sentence2_length': 3, 'average_sentence2_length': 107.62, 'max_sentence2_length': 378, 'unique_sentence2': 1996}, 'tha_Thai-bod_Tibt': {'num_samples': 1997, 'number_of_characters': 538097, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 118.91, 'max_sentence1_length': 439, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 150.54, 'max_sentence2_length': 478, 'unique_sentence2': 1993}, 'tha_Thai-dzo_Tibt': {'num_samples': 1997, 'number_of_characters': 480689, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 118.91, 'max_sentence1_length': 439, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 121.79, 'max_sentence2_length': 411, 'unique_sentence2': 1992}, 'tha_Thai-eng_Latn': {'num_samples': 1997, 
'number_of_characters': 485188, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 118.91, 'max_sentence1_length': 439, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'tha_Thai-khm_Khmr': {'num_samples': 1997, 'number_of_characters': 525959, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 118.91, 'max_sentence1_length': 439, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 144.46, 'max_sentence2_length': 517, 'unique_sentence2': 1996}, 'tha_Thai-lao_Laoo': {'num_samples': 1997, 'number_of_characters': 504448, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 118.91, 'max_sentence1_length': 439, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 133.69, 'max_sentence2_length': 507, 'unique_sentence2': 1997}, 'tha_Thai-mon_Mong': {'num_samples': 1997, 'number_of_characters': 496516, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 118.91, 'max_sentence1_length': 439, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 129.72, 'max_sentence2_length': 414, 'unique_sentence2': 1997}, 'tha_Thai-mya_Mymr': {'num_samples': 1997, 'number_of_characters': 549322, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 118.91, 'max_sentence1_length': 439, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 156.16, 'max_sentence2_length': 773, 'unique_sentence2': 1997}, 'tir_Ethi-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 332745, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'tir_Ethi-eng_Latn': {'num_samples': 
1997, 'number_of_characters': 412958, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'tir_Ethi-hau_Latn': {'num_samples': 1997, 'number_of_characters': 435204, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'tir_Ethi-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 411339, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 'unique_sentence2': 1997}, 'tir_Ethi-nso_Latn': {'num_samples': 1997, 'number_of_characters': 456737, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 1996}, 'tir_Ethi-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 402669, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'tir_Ethi-som_Latn': {'num_samples': 1997, 'number_of_characters': 456530, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'tir_Ethi-ssw_Latn': {'num_samples': 
1997, 'number_of_characters': 453380, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'tir_Ethi-swa_Latn': {'num_samples': 1997, 'number_of_characters': 437747, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'tir_Ethi-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 499521, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 'unique_sentence2': 1997}, 'tir_Ethi-wol_Latn': {'num_samples': 1997, 'number_of_characters': 405041, 'unique_pairs': 1996, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 'tir_Ethi-xho_Latn': {'num_samples': 1997, 'number_of_characters': 433328, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'tir_Ethi-yor_Latn': {'num_samples': 1997, 'number_of_characters': 481326, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'tir_Ethi-zul_Latn': {'num_samples': 
1997, 'number_of_characters': 422970, 'unique_pairs': 1997, 'min_sentence1_length': 5, 'average_sentence1_length': 82.74, 'max_sentence1_length': 272, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'ton_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 561360, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 157.06, 'max_sentence1_length': 468, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ton_Latn-fij_Latn': {'num_samples': 1997, 'number_of_characters': 614145, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 157.06, 'max_sentence1_length': 468, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 150.48, 'max_sentence2_length': 448, 'unique_sentence2': 1988}, 'ton_Latn-fil_Latn': {'num_samples': 1997, 'number_of_characters': 607060, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 157.06, 'max_sentence1_length': 468, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 146.93, 'max_sentence2_length': 554, 'unique_sentence2': 1997}, 'ton_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 600612, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 157.06, 'max_sentence1_length': 468, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'ton_Latn-mal_Mlym': {'num_samples': 1997, 'number_of_characters': 617792, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 157.06, 'max_sentence1_length': 468, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 152.3, 'max_sentence2_length': 540, 'unique_sentence2': 1996}, 'ton_Latn-mlg_Latn': 
{'num_samples': 1997, 'number_of_characters': 633948, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 157.06, 'max_sentence1_length': 468, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 160.39, 'max_sentence2_length': 559, 'unique_sentence2': 1994}, 'ton_Latn-mri_Latn': {'num_samples': 1997, 'number_of_characters': 587764, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 157.06, 'max_sentence1_length': 468, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 137.27, 'max_sentence2_length': 443, 'unique_sentence2': 1997}, 'ton_Latn-msa_Latn': {'num_samples': 1997, 'number_of_characters': 590823, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 157.06, 'max_sentence1_length': 468, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 138.8, 'max_sentence2_length': 463, 'unique_sentence2': 1997}, 'ton_Latn-smo_Latn': {'num_samples': 1997, 'number_of_characters': 591495, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 157.06, 'max_sentence1_length': 468, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 139.14, 'max_sentence2_length': 431, 'unique_sentence2': 1996}, 'ton_Latn-tah_Latn': {'num_samples': 1997, 'number_of_characters': 623263, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 157.06, 'max_sentence1_length': 468, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 155.04, 'max_sentence2_length': 524, 'unique_sentence2': 1997}, 'tsn_Latn-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 501790, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 
'tsn_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 582003, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'tsn_Latn-hau_Latn': {'num_samples': 1997, 'number_of_characters': 604249, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'tsn_Latn-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 580384, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 'unique_sentence2': 1997}, 'tsn_Latn-nso_Latn': {'num_samples': 1997, 'number_of_characters': 625782, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 1996}, 'tsn_Latn-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 571714, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'tsn_Latn-som_Latn': {'num_samples': 1997, 'number_of_characters': 625575, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 
1997}, 'tsn_Latn-ssw_Latn': {'num_samples': 1997, 'number_of_characters': 622425, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'tsn_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 606792, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'tsn_Latn-tir_Ethi': {'num_samples': 1997, 'number_of_characters': 499521, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 272, 'unique_sentence2': 1996}, 'tsn_Latn-wol_Latn': {'num_samples': 1997, 'number_of_characters': 574086, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 'tsn_Latn-xho_Latn': {'num_samples': 1997, 'number_of_characters': 602373, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'tsn_Latn-yor_Latn': {'num_samples': 1997, 'number_of_characters': 650371, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 
'unique_sentence2': 1996}, 'tsn_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 592015, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 167.39, 'max_sentence1_length': 556, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'tuk_Latn-aze_Latn': {'num_samples': 1997, 'number_of_characters': 554908, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 142.85, 'max_sentence1_length': 576, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 135.02, 'max_sentence2_length': 398, 'unique_sentence2': 1997}, 'tuk_Latn-bak_Cyrl': {'num_samples': 1997, 'number_of_characters': 531600, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 142.85, 'max_sentence1_length': 576, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 123.35, 'max_sentence2_length': 437, 'unique_sentence2': 1995}, 'tuk_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 532994, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 142.85, 'max_sentence1_length': 576, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'tuk_Latn-kaz_Cyrl': {'num_samples': 1997, 'number_of_characters': 545550, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 142.85, 'max_sentence1_length': 576, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 130.33, 'max_sentence2_length': 473, 'unique_sentence2': 1996}, 'tuk_Latn-kir_Cyrl': {'num_samples': 1997, 'number_of_characters': 536138, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 142.85, 'max_sentence1_length': 576, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 125.62, 'max_sentence2_length': 
395, 'unique_sentence2': 1996}, 'tuk_Latn-tat_Cyrl': {'num_samples': 1997, 'number_of_characters': 531200, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 142.85, 'max_sentence1_length': 576, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 123.15, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'tuk_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 550887, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 142.85, 'max_sentence1_length': 576, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'tuk_Latn-uig_Arab': {'num_samples': 1997, 'number_of_characters': 596296, 'unique_pairs': 1997, 'min_sentence1_length': 9, 'average_sentence1_length': 142.85, 'max_sentence1_length': 576, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 155.74, 'max_sentence2_length': 592, 'unique_sentence2': 1996}, 'tuk_Latn-uzb_Latn': {'num_samples': 1997, 'number_of_characters': 578969, 'unique_pairs': 1996, 'min_sentence1_length': 9, 'average_sentence1_length': 142.85, 'max_sentence1_length': 576, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 147.07, 'max_sentence2_length': 470, 'unique_sentence2': 1996}, 'tur_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 496794, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'tur_Latn-aze_Latn': {'num_samples': 1997, 'number_of_characters': 535247, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 135.02, 
'max_sentence2_length': 398, 'unique_sentence2': 1997}, 'tur_Latn-bak_Cyrl': {'num_samples': 1997, 'number_of_characters': 511939, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 123.35, 'max_sentence2_length': 437, 'unique_sentence2': 1995}, 'tur_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 509415, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'tur_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 561263, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'tur_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 564740, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'tur_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 513333, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'tur_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 508952, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 
121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'tur_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 535413, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'tur_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 558721, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'tur_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 465909, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'tur_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 527188, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'tur_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 543989, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'tur_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 552585, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 
'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'tur_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 377239, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'tur_Latn-kaz_Cyrl': {'num_samples': 1997, 'number_of_characters': 525889, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 130.33, 'max_sentence2_length': 473, 'unique_sentence2': 1996}, 'tur_Latn-kir_Cyrl': {'num_samples': 1997, 'number_of_characters': 516477, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 125.62, 'max_sentence2_length': 395, 'unique_sentence2': 1996}, 'tur_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 399018, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'tur_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 525010, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'tur_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 557528, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 
'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'tur_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 543479, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'tur_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 542828, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'tur_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 540506, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'tur_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 553813, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'tur_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 538122, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'tur_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 517440, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 
'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'tur_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 575574, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'tur_Latn-tat_Cyrl': {'num_samples': 1997, 'number_of_characters': 511539, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 123.15, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'tur_Latn-tuk_Latn': {'num_samples': 1997, 'number_of_characters': 550887, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 142.85, 'max_sentence2_length': 576, 'unique_sentence2': 1996}, 'tur_Latn-uig_Arab': {'num_samples': 1997, 'number_of_characters': 576635, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 155.74, 'max_sentence2_length': 592, 'unique_sentence2': 1996}, 'tur_Latn-uzb_Latn': {'num_samples': 1997, 'number_of_characters': 559308, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 147.07, 'max_sentence2_length': 470, 'unique_sentence2': 1996}, 'tur_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 536734, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 
1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'tur_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 357091, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'tur_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 523345, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 133.01, 'max_sentence1_length': 504, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'uig_Arab-aze_Latn': {'num_samples': 1997, 'number_of_characters': 580656, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 155.74, 'max_sentence1_length': 592, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 135.02, 'max_sentence2_length': 398, 'unique_sentence2': 1997}, 'uig_Arab-bak_Cyrl': {'num_samples': 1997, 'number_of_characters': 557348, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 155.74, 'max_sentence1_length': 592, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 123.35, 'max_sentence2_length': 437, 'unique_sentence2': 1995}, 'uig_Arab-eng_Latn': {'num_samples': 1997, 'number_of_characters': 558742, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 155.74, 'max_sentence1_length': 592, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'uig_Arab-kaz_Cyrl': {'num_samples': 1997, 'number_of_characters': 571298, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 155.74, 'max_sentence1_length': 592, 
'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 130.33, 'max_sentence2_length': 473, 'unique_sentence2': 1996}, 'uig_Arab-kir_Cyrl': {'num_samples': 1997, 'number_of_characters': 561886, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 155.74, 'max_sentence1_length': 592, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 125.62, 'max_sentence2_length': 395, 'unique_sentence2': 1996}, 'uig_Arab-tat_Cyrl': {'num_samples': 1997, 'number_of_characters': 556948, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 155.74, 'max_sentence1_length': 592, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 123.15, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'uig_Arab-tuk_Latn': {'num_samples': 1997, 'number_of_characters': 596296, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 155.74, 'max_sentence1_length': 592, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 142.85, 'max_sentence2_length': 576, 'unique_sentence2': 1996}, 'uig_Arab-tur_Latn': {'num_samples': 1997, 'number_of_characters': 576635, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 155.74, 'max_sentence1_length': 592, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'uig_Arab-uzb_Latn': {'num_samples': 1997, 'number_of_characters': 604717, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 155.74, 'max_sentence1_length': 592, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 147.07, 'max_sentence2_length': 470, 'unique_sentence2': 1996}, 'ukr_Cyrl-bel_Cyrl': {'num_samples': 1997, 'number_of_characters': 518873, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 131.59, 'max_sentence1_length': 
440, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.24, 'max_sentence2_length': 422, 'unique_sentence2': 1996}, 'ukr_Cyrl-bos_Latn': {'num_samples': 1997, 'number_of_characters': 517693, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 131.59, 'max_sentence1_length': 440, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 127.65, 'max_sentence2_length': 434, 'unique_sentence2': 1996}, 'ukr_Cyrl-bul_Cyrl': {'num_samples': 1997, 'number_of_characters': 532672, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 131.59, 'max_sentence1_length': 440, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 135.15, 'max_sentence2_length': 493, 'unique_sentence2': 1996}, 'ukr_Cyrl-ces_Latn': {'num_samples': 1997, 'number_of_characters': 504101, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 131.59, 'max_sentence1_length': 440, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 120.84, 'max_sentence2_length': 474, 'unique_sentence2': 1997}, 'ukr_Cyrl-eng_Latn': {'num_samples': 1997, 'number_of_characters': 510503, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 131.59, 'max_sentence1_length': 440, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ukr_Cyrl-hrv_Latn': {'num_samples': 1997, 'number_of_characters': 518708, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 131.59, 'max_sentence1_length': 440, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 128.15, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'ukr_Cyrl-mkd_Cyrl': {'num_samples': 1997, 'number_of_characters': 530674, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 131.59, 
'max_sentence1_length': 440, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.15, 'max_sentence2_length': 451, 'unique_sentence2': 1997}, 'ukr_Cyrl-pol_Latn': {'num_samples': 1997, 'number_of_characters': 540649, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 131.59, 'max_sentence1_length': 440, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'ukr_Cyrl-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 537676, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 131.59, 'max_sentence1_length': 440, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'ukr_Cyrl-slk_Latn': {'num_samples': 1997, 'number_of_characters': 515752, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 131.59, 'max_sentence1_length': 440, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 126.67, 'max_sentence2_length': 403, 'unique_sentence2': 1996}, 'ukr_Cyrl-slv_Latn': {'num_samples': 1997, 'number_of_characters': 515679, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 131.59, 'max_sentence1_length': 440, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.64, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'ukr_Cyrl-srp_Cyrl': {'num_samples': 1997, 'number_of_characters': 515086, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 131.59, 'max_sentence1_length': 440, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 126.34, 'max_sentence2_length': 439, 'unique_sentence2': 1995}, 'ukr_Cyrl-srp_Latn': {'num_samples': 1997, 'number_of_characters': 518924, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 
131.59, 'max_sentence1_length': 440, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 128.26, 'max_sentence2_length': 452, 'unique_sentence2': 1996}, 'urd_Arab-ben_Beng': {'num_samples': 1997, 'number_of_characters': 491800, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'urd_Arab-div_Thaa': {'num_samples': 1997, 'number_of_characters': 551846, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 152.15, 'max_sentence2_length': 609, 'unique_sentence2': 1996}, 'urd_Arab-eng_Latn': {'num_samples': 1997, 'number_of_characters': 495718, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'urd_Arab-eus_Latn': {'num_samples': 1997, 'number_of_characters': 523201, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 137.81, 'max_sentence2_length': 393, 'unique_sentence2': 1997}, 'urd_Arab-guj_Gujr': {'num_samples': 1997, 'number_of_characters': 492929, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 122.65, 'max_sentence2_length': 378, 'unique_sentence2': 1997}, 'urd_Arab-hin_Deva': {'num_samples': 1997, 'number_of_characters': 509573, 'unique_pairs': 1996, 'min_sentence1_length': 7, 
'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'urd_Arab-kan_Knda': {'num_samples': 1997, 'number_of_characters': 513534, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 132.97, 'max_sentence2_length': 449, 'unique_sentence2': 1996}, 'urd_Arab-mar_Deva': {'num_samples': 1997, 'number_of_characters': 508885, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 130.64, 'max_sentence2_length': 443, 'unique_sentence2': 1995}, 'urd_Arab-nep_Deva': {'num_samples': 1997, 'number_of_characters': 496221, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 124.3, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'urd_Arab-pan_Guru': {'num_samples': 1997, 'number_of_characters': 498420, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 125.4, 'max_sentence2_length': 383, 'unique_sentence2': 1996}, 'urd_Arab-sin_Sinh': {'num_samples': 1997, 'number_of_characters': 506739, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 129.56, 'max_sentence2_length': 441, 'unique_sentence2': 1996}, 'urd_Arab-snd_Arab': {'num_samples': 1997, 'number_of_characters': 468325, 'unique_pairs': 1996, 'min_sentence1_length': 
7, 'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 110.33, 'max_sentence2_length': 335, 'unique_sentence2': 1996}, 'urd_Arab-tam_Taml': {'num_samples': 1997, 'number_of_characters': 557959, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'urd_Arab-tel_Telu': {'num_samples': 1997, 'number_of_characters': 495525, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 124.19, 'max_sentence1_length': 390, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 123.95, 'max_sentence2_length': 412, 'unique_sentence2': 1996}, 'uzb_Latn-aze_Latn': {'num_samples': 1997, 'number_of_characters': 563329, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 147.07, 'max_sentence1_length': 470, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 135.02, 'max_sentence2_length': 398, 'unique_sentence2': 1997}, 'uzb_Latn-bak_Cyrl': {'num_samples': 1997, 'number_of_characters': 540021, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 147.07, 'max_sentence1_length': 470, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 123.35, 'max_sentence2_length': 437, 'unique_sentence2': 1995}, 'uzb_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 541415, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 147.07, 'max_sentence1_length': 470, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'uzb_Latn-kaz_Cyrl': {'num_samples': 1997, 'number_of_characters': 553971, 'unique_pairs': 1996, 
'min_sentence1_length': 8, 'average_sentence1_length': 147.07, 'max_sentence1_length': 470, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 130.33, 'max_sentence2_length': 473, 'unique_sentence2': 1996}, 'uzb_Latn-kir_Cyrl': {'num_samples': 1997, 'number_of_characters': 544559, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 147.07, 'max_sentence1_length': 470, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 125.62, 'max_sentence2_length': 395, 'unique_sentence2': 1996}, 'uzb_Latn-tat_Cyrl': {'num_samples': 1997, 'number_of_characters': 539621, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 147.07, 'max_sentence1_length': 470, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 123.15, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'uzb_Latn-tuk_Latn': {'num_samples': 1997, 'number_of_characters': 578969, 'unique_pairs': 1996, 'min_sentence1_length': 8, 'average_sentence1_length': 147.07, 'max_sentence1_length': 470, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 142.85, 'max_sentence2_length': 576, 'unique_sentence2': 1996}, 'uzb_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 559308, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 147.07, 'max_sentence1_length': 470, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'uzb_Latn-uig_Arab': {'num_samples': 1997, 'number_of_characters': 604717, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 147.07, 'max_sentence1_length': 470, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 155.74, 'max_sentence2_length': 592, 'unique_sentence2': 1996}, 'ven_Latn-bem_Latn': {'num_samples': 1997, 'number_of_characters': 598248, 'unique_pairs': 
1997, 'min_sentence1_length': 10, 'average_sentence1_length': 150.1, 'max_sentence1_length': 535, 'unique_sentence1': 1993, 'min_sentence2_length': 8, 'average_sentence2_length': 149.47, 'max_sentence2_length': 465, 'unique_sentence2': 1997}, 'ven_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 547476, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 150.1, 'max_sentence1_length': 535, 'unique_sentence1': 1993, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'ven_Latn-ewe_Latn': {'num_samples': 1997, 'number_of_characters': 538734, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 150.1, 'max_sentence1_length': 535, 'unique_sentence1': 1993, 'min_sentence2_length': 6, 'average_sentence2_length': 119.67, 'max_sentence2_length': 493, 'unique_sentence2': 1994}, 'ven_Latn-fuc_Latn': {'num_samples': 1997, 'number_of_characters': 528236, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 150.1, 'max_sentence1_length': 535, 'unique_sentence1': 1993, 'min_sentence2_length': 6, 'average_sentence2_length': 114.41, 'max_sentence2_length': 376, 'unique_sentence2': 1996}, 'ven_Latn-kin_Latn': {'num_samples': 1997, 'number_of_characters': 603543, 'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 150.1, 'max_sentence1_length': 535, 'unique_sentence1': 1993, 'min_sentence2_length': 10, 'average_sentence2_length': 152.12, 'max_sentence2_length': 541, 'unique_sentence2': 1996}, 'ven_Latn-nde_Latn': {'num_samples': 1997, 'number_of_characters': 597495, 'unique_pairs': 1997, 'min_sentence1_length': 10, 'average_sentence1_length': 150.1, 'max_sentence1_length': 535, 'unique_sentence1': 1993, 'min_sentence2_length': 6, 'average_sentence2_length': 149.09, 'max_sentence2_length': 590, 'unique_sentence2': 1997}, 'ven_Latn-nya_Latn': {'num_samples': 1997, 'number_of_characters': 584038, 
'unique_pairs': 1996, 'min_sentence1_length': 10, 'average_sentence1_length': 150.1, 'max_sentence1_length': 535, 'unique_sentence1': 1993, 'min_sentence2_length': 10, 'average_sentence2_length': 142.35, 'max_sentence2_length': 464, 'unique_sentence2': 1993}, 'ven_Latn-sna_Latn': {'num_samples': 1997, 'number_of_characters': 598086, 'unique_pairs': 1995, 'min_sentence1_length': 10, 'average_sentence1_length': 150.1, 'max_sentence1_length': 535, 'unique_sentence1': 1993, 'min_sentence2_length': 6, 'average_sentence2_length': 149.39, 'max_sentence2_length': 511, 'unique_sentence2': 1995}, 'vie_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 502302, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'vie_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 514923, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'vie_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 566771, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'vie_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 570248, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'vie_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 
518841, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'vie_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 514460, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'vie_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 540921, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'vie_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 564229, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'vie_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 471417, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'vie_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 532696, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'vie_Latn-hun_Latn': {'num_samples': 1997, 
'number_of_characters': 549497, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'vie_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 558093, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'vie_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 382747, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'vie_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 404526, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'vie_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 530518, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'vie_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 563036, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'vie_Latn-pol_Latn': {'num_samples': 1997, 
'number_of_characters': 548987, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'vie_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 548336, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'vie_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 546014, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'vie_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 559321, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'vie_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 543630, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'vie_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 522948, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'vie_Latn-tam_Taml': {'num_samples': 
1997, 'number_of_characters': 581082, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'vie_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 536734, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'vie_Latn-yue_Hant': {'num_samples': 1997, 'number_of_characters': 350008, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 39.5, 'max_sentence2_length': 133, 'unique_sentence2': 1996}, 'vie_Latn-zho_Hans': {'num_samples': 1997, 'number_of_characters': 356082, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 42.54, 'max_sentence2_length': 263, 'unique_sentence2': 1997}, 'vie_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 362599, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'vie_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 528853, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 135.76, 'max_sentence1_length': 437, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'wol_Latn-amh_Ethi': 
{'num_samples': 1997, 'number_of_characters': 407310, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'wol_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 487523, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'wol_Latn-hau_Latn': {'num_samples': 1997, 'number_of_characters': 509769, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'wol_Latn-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 485904, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 'unique_sentence2': 1997}, 'wol_Latn-nso_Latn': {'num_samples': 1997, 'number_of_characters': 531302, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 1996}, 'wol_Latn-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 477234, 'unique_pairs': 1992, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 
'wol_Latn-som_Latn': {'num_samples': 1997, 'number_of_characters': 531095, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'wol_Latn-ssw_Latn': {'num_samples': 1997, 'number_of_characters': 527945, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'wol_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 512312, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'wol_Latn-tir_Ethi': {'num_samples': 1997, 'number_of_characters': 405041, 'unique_pairs': 1996, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 272, 'unique_sentence2': 1996}, 'wol_Latn-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 574086, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 'unique_sentence2': 1997}, 'wol_Latn-xho_Latn': {'num_samples': 1997, 'number_of_characters': 507893, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 
1997}, 'wol_Latn-yor_Latn': {'num_samples': 1997, 'number_of_characters': 555891, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'wol_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 497535, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 120.08, 'max_sentence1_length': 405, 'unique_sentence1': 1990, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'xho_Latn-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 435597, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'xho_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 515810, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'xho_Latn-hau_Latn': {'num_samples': 1997, 'number_of_characters': 538056, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'xho_Latn-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 514191, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 
'unique_sentence2': 1997}, 'xho_Latn-nso_Latn': {'num_samples': 1997, 'number_of_characters': 559589, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 1996}, 'xho_Latn-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 505521, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'xho_Latn-som_Latn': {'num_samples': 1997, 'number_of_characters': 559382, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'xho_Latn-ssw_Latn': {'num_samples': 1997, 'number_of_characters': 556232, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'xho_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 540599, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'xho_Latn-tir_Ethi': {'num_samples': 1997, 'number_of_characters': 433328, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 
272, 'unique_sentence2': 1996}, 'xho_Latn-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 602373, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 'unique_sentence2': 1997}, 'xho_Latn-wol_Latn': {'num_samples': 1997, 'number_of_characters': 507893, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 'xho_Latn-yor_Latn': {'num_samples': 1997, 'number_of_characters': 584178, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'xho_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 525822, 'unique_pairs': 1997, 'min_sentence1_length': 6, 'average_sentence1_length': 134.25, 'max_sentence1_length': 492, 'unique_sentence1': 1997, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'yor_Latn-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 483595, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'yor_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 563808, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 
'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'yor_Latn-hau_Latn': {'num_samples': 1997, 'number_of_characters': 586054, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'yor_Latn-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 562189, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 'unique_sentence2': 1997}, 'yor_Latn-nso_Latn': {'num_samples': 1997, 'number_of_characters': 607587, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 1996}, 'yor_Latn-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 553519, 'unique_pairs': 1996, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'yor_Latn-som_Latn': {'num_samples': 1997, 'number_of_characters': 607380, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'yor_Latn-ssw_Latn': {'num_samples': 1997, 'number_of_characters': 604230, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 
144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'yor_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 588597, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'yor_Latn-tir_Ethi': {'num_samples': 1997, 'number_of_characters': 481326, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 272, 'unique_sentence2': 1996}, 'yor_Latn-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 650371, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 'unique_sentence2': 1997}, 'yor_Latn-wol_Latn': {'num_samples': 1997, 'number_of_characters': 555891, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 'yor_Latn-xho_Latn': {'num_samples': 1997, 'number_of_characters': 584178, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'yor_Latn-zul_Latn': {'num_samples': 1997, 'number_of_characters': 573820, 'unique_pairs': 1997, 'min_sentence1_length': 7, 'average_sentence1_length': 158.28, 'max_sentence1_length': 582, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 
'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'yue_Hant-eng_Latn': {'num_samples': 1997, 'number_of_characters': 326607, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 39.5, 'max_sentence1_length': 133, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'yue_Hant-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 190513, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 39.5, 'max_sentence1_length': 133, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'yue_Hant-kor_Hang': {'num_samples': 1997, 'number_of_characters': 212292, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 39.5, 'max_sentence1_length': 133, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'yue_Hant-vie_Latn': {'num_samples': 1997, 'number_of_characters': 350008, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 39.5, 'max_sentence1_length': 133, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'yue_Hant-zho_Hans': {'num_samples': 1997, 'number_of_characters': 163848, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 39.5, 'max_sentence1_length': 133, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 42.54, 'max_sentence2_length': 263, 'unique_sentence2': 1997}, 'yue_Hant-zho_Hant': {'num_samples': 1997, 'number_of_characters': 170365, 'unique_pairs': 1996, 'min_sentence1_length': 4, 'average_sentence1_length': 39.5, 'max_sentence1_length': 133, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 
'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'zho_Hans-eng_Latn': {'num_samples': 1997, 'number_of_characters': 332681, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 42.54, 'max_sentence1_length': 263, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'zho_Hans-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 196587, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 42.54, 'max_sentence1_length': 263, 'unique_sentence1': 1997, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'zho_Hans-kor_Hang': {'num_samples': 1997, 'number_of_characters': 218366, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 42.54, 'max_sentence1_length': 263, 'unique_sentence1': 1997, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'zho_Hans-vie_Latn': {'num_samples': 1997, 'number_of_characters': 356082, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 42.54, 'max_sentence1_length': 263, 'unique_sentence1': 1997, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'zho_Hans-yue_Hant': {'num_samples': 1997, 'number_of_characters': 163848, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 42.54, 'max_sentence1_length': 263, 'unique_sentence1': 1997, 'min_sentence2_length': 4, 'average_sentence2_length': 39.5, 'max_sentence2_length': 133, 'unique_sentence2': 1996}, 'zho_Hans-zho_Hant': {'num_samples': 1997, 'number_of_characters': 176439, 'unique_pairs': 1997, 'min_sentence1_length': 4, 'average_sentence1_length': 42.54, 'max_sentence1_length': 263, 'unique_sentence1': 1997, 'min_sentence2_length': 3, 
'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}, 'zho_Hant-arb_Arab': {'num_samples': 1997, 'number_of_characters': 322659, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'zho_Hant-ben_Beng': {'num_samples': 1997, 'number_of_characters': 335280, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'zho_Hant-deu_Latn': {'num_samples': 1997, 'number_of_characters': 387128, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'zho_Hant-ell_Grek': {'num_samples': 1997, 'number_of_characters': 390605, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'zho_Hant-eng_Latn': {'num_samples': 1997, 'number_of_characters': 339198, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'zho_Hant-fas_Arab': {'num_samples': 1997, 'number_of_characters': 334817, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 
'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'zho_Hant-fin_Latn': {'num_samples': 1997, 'number_of_characters': 361278, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'zho_Hant-fra_Latn': {'num_samples': 1997, 'number_of_characters': 384586, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'zho_Hant-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 291774, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'zho_Hant-hin_Deva': {'num_samples': 1997, 'number_of_characters': 353053, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'zho_Hant-hun_Latn': {'num_samples': 1997, 'number_of_characters': 369854, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'zho_Hant-ind_Latn': {'num_samples': 1997, 'number_of_characters': 378450, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 
'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'zho_Hant-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 203104, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'zho_Hant-kor_Hang': {'num_samples': 1997, 'number_of_characters': 224883, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'zho_Hant-lit_Latn': {'num_samples': 1997, 'number_of_characters': 350875, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'zho_Hant-nld_Latn': {'num_samples': 1997, 'number_of_characters': 383393, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'zho_Hant-pol_Latn': {'num_samples': 1997, 'number_of_characters': 369344, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'zho_Hant-por_Latn': {'num_samples': 1997, 'number_of_characters': 368693, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 
'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'zho_Hant-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 366371, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'zho_Hant-spa_Latn': {'num_samples': 1997, 'number_of_characters': 379678, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'zho_Hant-swa_Latn': {'num_samples': 1997, 'number_of_characters': 363987, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'zho_Hant-swe_Latn': {'num_samples': 1997, 'number_of_characters': 343305, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'zho_Hant-tam_Taml': {'num_samples': 1997, 'number_of_characters': 401439, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'zho_Hant-tur_Latn': {'num_samples': 1997, 'number_of_characters': 357091, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 
'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'zho_Hant-vie_Latn': {'num_samples': 1997, 'number_of_characters': 362599, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'zho_Hant-yue_Hant': {'num_samples': 1997, 'number_of_characters': 170365, 'unique_pairs': 1996, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 39.5, 'max_sentence2_length': 133, 'unique_sentence2': 1996}, 'zho_Hant-zho_Hans': {'num_samples': 1997, 'number_of_characters': 176439, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 42.54, 'max_sentence2_length': 263, 'unique_sentence2': 1997}, 'zho_Hant-zul_Latn': {'num_samples': 1997, 'number_of_characters': 349210, 'unique_pairs': 1997, 'min_sentence1_length': 3, 'average_sentence1_length': 45.81, 'max_sentence1_length': 200, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 129.06, 'max_sentence2_length': 494, 'unique_sentence2': 1996}, 'zul_Latn-amh_Ethi': {'num_samples': 1997, 'number_of_characters': 425239, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 83.88, 'max_sentence2_length': 290, 'unique_sentence2': 1994}, 'zul_Latn-arb_Arab': {'num_samples': 1997, 'number_of_characters': 488913, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 
'average_sentence2_length': 115.76, 'max_sentence2_length': 362, 'unique_sentence2': 1995}, 'zul_Latn-ben_Beng': {'num_samples': 1997, 'number_of_characters': 501534, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 122.08, 'max_sentence2_length': 402, 'unique_sentence2': 1997}, 'zul_Latn-deu_Latn': {'num_samples': 1997, 'number_of_characters': 553382, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 148.05, 'max_sentence2_length': 508, 'unique_sentence2': 1996}, 'zul_Latn-ell_Grek': {'num_samples': 1997, 'number_of_characters': 556859, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 149.79, 'max_sentence2_length': 584, 'unique_sentence2': 1996}, 'zul_Latn-eng_Latn': {'num_samples': 1997, 'number_of_characters': 505452, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 124.05, 'max_sentence2_length': 437, 'unique_sentence2': 1997}, 'zul_Latn-fas_Arab': {'num_samples': 1997, 'number_of_characters': 501071, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 121.85, 'max_sentence2_length': 389, 'unique_sentence2': 1995}, 'zul_Latn-fin_Latn': {'num_samples': 1997, 'number_of_characters': 527532, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 
'min_sentence2_length': 7, 'average_sentence2_length': 135.1, 'max_sentence2_length': 463, 'unique_sentence2': 1996}, 'zul_Latn-fra_Latn': {'num_samples': 1997, 'number_of_characters': 550840, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.77, 'max_sentence2_length': 512, 'unique_sentence2': 1996}, 'zul_Latn-hau_Latn': {'num_samples': 1997, 'number_of_characters': 527698, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 135.19, 'max_sentence2_length': 483, 'unique_sentence2': 1997}, 'zul_Latn-heb_Hebr': {'num_samples': 1997, 'number_of_characters': 458028, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 100.3, 'max_sentence2_length': 375, 'unique_sentence2': 1996}, 'zul_Latn-hin_Deva': {'num_samples': 1997, 'number_of_characters': 519307, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 130.98, 'max_sentence2_length': 394, 'unique_sentence2': 1996}, 'zul_Latn-hun_Latn': {'num_samples': 1997, 'number_of_characters': 536108, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 139.4, 'max_sentence2_length': 508, 'unique_sentence2': 1997}, 'zul_Latn-ibo_Latn': {'num_samples': 1997, 'number_of_characters': 503833, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 
1996, 'min_sentence2_length': 6, 'average_sentence2_length': 123.24, 'max_sentence2_length': 469, 'unique_sentence2': 1997}, 'zul_Latn-ind_Latn': {'num_samples': 1997, 'number_of_characters': 544704, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 143.7, 'max_sentence2_length': 486, 'unique_sentence2': 1997}, 'zul_Latn-jpn_Jpan': {'num_samples': 1997, 'number_of_characters': 369358, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 4, 'average_sentence2_length': 55.9, 'max_sentence2_length': 189, 'unique_sentence2': 1994}, 'zul_Latn-kor_Hang': {'num_samples': 1997, 'number_of_characters': 391137, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 66.8, 'max_sentence2_length': 217, 'unique_sentence2': 1995}, 'zul_Latn-lit_Latn': {'num_samples': 1997, 'number_of_characters': 517129, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 129.89, 'max_sentence2_length': 446, 'unique_sentence2': 1995}, 'zul_Latn-nld_Latn': {'num_samples': 1997, 'number_of_characters': 549647, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 146.18, 'max_sentence2_length': 539, 'unique_sentence2': 1996}, 'zul_Latn-nso_Latn': {'num_samples': 1997, 'number_of_characters': 549231, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 
1996, 'min_sentence2_length': 5, 'average_sentence2_length': 145.97, 'max_sentence2_length': 487, 'unique_sentence2': 1996}, 'zul_Latn-orm_Ethi': {'num_samples': 1997, 'number_of_characters': 495163, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 118.89, 'max_sentence2_length': 466, 'unique_sentence2': 1984}, 'zul_Latn-pol_Latn': {'num_samples': 1997, 'number_of_characters': 535598, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 9, 'average_sentence2_length': 139.14, 'max_sentence2_length': 468, 'unique_sentence2': 1996}, 'zul_Latn-por_Latn': {'num_samples': 1997, 'number_of_characters': 534947, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 138.82, 'max_sentence2_length': 497, 'unique_sentence2': 1996}, 'zul_Latn-rus_Cyrl': {'num_samples': 1997, 'number_of_characters': 532625, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 137.65, 'max_sentence2_length': 419, 'unique_sentence2': 1996}, 'zul_Latn-som_Latn': {'num_samples': 1997, 'number_of_characters': 549024, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 145.86, 'max_sentence2_length': 455, 'unique_sentence2': 1997}, 'zul_Latn-spa_Latn': {'num_samples': 1997, 'number_of_characters': 545932, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 
'unique_sentence1': 1996, 'min_sentence2_length': 1, 'average_sentence2_length': 144.32, 'max_sentence2_length': 504, 'unique_sentence2': 1996}, 'zul_Latn-ssw_Latn': {'num_samples': 1997, 'number_of_characters': 545874, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 144.29, 'max_sentence2_length': 510, 'unique_sentence2': 1996}, 'zul_Latn-swa_Latn': {'num_samples': 1997, 'number_of_characters': 530241, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 10, 'average_sentence2_length': 136.46, 'max_sentence2_length': 430, 'unique_sentence2': 1997}, 'zul_Latn-swe_Latn': {'num_samples': 1997, 'number_of_characters': 509559, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 8, 'average_sentence2_length': 126.1, 'max_sentence2_length': 430, 'unique_sentence2': 1996}, 'zul_Latn-tam_Taml': {'num_samples': 1997, 'number_of_characters': 567693, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 11, 'average_sentence2_length': 155.21, 'max_sentence2_length': 581, 'unique_sentence2': 1997}, 'zul_Latn-tir_Ethi': {'num_samples': 1997, 'number_of_characters': 422970, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 5, 'average_sentence2_length': 82.74, 'max_sentence2_length': 272, 'unique_sentence2': 1996}, 'zul_Latn-tsn_Latn': {'num_samples': 1997, 'number_of_characters': 592015, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 
494, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 167.39, 'max_sentence2_length': 556, 'unique_sentence2': 1997}, 'zul_Latn-tur_Latn': {'num_samples': 1997, 'number_of_characters': 523345, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 133.01, 'max_sentence2_length': 504, 'unique_sentence2': 1997}, 'zul_Latn-vie_Latn': {'num_samples': 1997, 'number_of_characters': 528853, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 135.76, 'max_sentence2_length': 437, 'unique_sentence2': 1996}, 'zul_Latn-wol_Latn': {'num_samples': 1997, 'number_of_characters': 497535, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 120.08, 'max_sentence2_length': 405, 'unique_sentence2': 1990}, 'zul_Latn-xho_Latn': {'num_samples': 1997, 'number_of_characters': 525822, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 6, 'average_sentence2_length': 134.25, 'max_sentence2_length': 492, 'unique_sentence2': 1997}, 'zul_Latn-yor_Latn': {'num_samples': 1997, 'number_of_characters': 573820, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 7, 'average_sentence2_length': 158.28, 'max_sentence2_length': 582, 'unique_sentence2': 1996}, 'zul_Latn-zho_Hant': {'num_samples': 1997, 'number_of_characters': 349210, 'unique_pairs': 1997, 'min_sentence1_length': 8, 'average_sentence1_length': 129.06, 
'max_sentence1_length': 494, 'unique_sentence1': 1996, 'min_sentence2_length': 3, 'average_sentence2_length': 45.81, 'max_sentence2_length': 200, 'unique_sentence2': 1996}}}} | | [NYSJudicialEthicsLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | -| [NaijaSenti](https://github.com/hausanlp/NaijaSenti) | ['hau', 'ibo', 'pcm', 'yor'] | Classification | s2s | [Social, Written] | None | None | +| [NaijaSenti](https://github.com/hausanlp/NaijaSenti) (Muhammad et al., 2022) | ['hau', 'ibo', 'pcm', 'yor'] | Classification | s2s | [Social, Written] | None | None | | [NamaaMrTydiReranking](https://huggingface.co/NAMAA-Space) (Muennighoff et al., 2022) | ['ara'] | Reranking | s2s | [Encyclopaedic, Written] | None | None | | [NanoArguAnaRetrieval](http://argumentation.bplaced.net/arguana/data) (Boteva et al., 2016) | ['eng'] | Retrieval | s2p | [Medical, Written] | None | None | | [NanoClimateFeverRetrieval](https://arxiv.org/abs/2012.00614) (Thomas Diggelmann, 2021) | ['eng'] | Retrieval | s2p | [Academic, News, Non-fiction] | None | None | | [NanoDBPediaRetrieval](https://huggingface.co/datasets/zeta-alpha-ai/NanoDBPedia) (Lehmann et al., 2015) | ['eng'] | Retrieval | s2p | [Encyclopaedic] | None | None | -| [NanoFEVERRetrieval](https://fever.ai/) | ['eng'] | Retrieval | s2p | [Academic, Encyclopaedic] | None | None | +| [NanoFEVERRetrieval](https://fever.ai/) (Thorne et al., 2018) | ['eng'] | Retrieval | s2p | [Academic, Encyclopaedic] | None | None | | [NanoFiQA2018Retrieval](https://sites.google.com/view/fiqa/) (Nandan Thakur, 2021) | ['eng'] | Retrieval | s2p | [Academic, Social] | None | None | -| [NanoHotpotQARetrieval](https://hotpotqa.github.io/) | ['eng'] | Retrieval | s2p | [Web, Written] | None | None | -| [NanoMSMARCORetrieval](https://microsoft.github.io/msmarco/) (Tri Nguyen and Mir Rosenberg and Xia Song and Jianfeng Gao and Saurabh Tiwary and 
Rangan Majumder and Li Deng, 2016) | ['eng'] | Retrieval | s2p | [Web] | None | None | +| [NanoHotpotQARetrieval](https://hotpotqa.github.io/) (Yang et al., 2018) | ['eng'] | Retrieval | s2p | [Web, Written] | None | None | +| [NanoMSMARCORetrieval](https://microsoft.github.io/msmarco/) (Tri Nguyen and Mir Rosenberg and Xia Song and Jianfeng Gao and Saurabh Tiwary and Rangan Majumder and Li Deng, 2016) | ['eng'] | Retrieval | s2p | [Web] | None | None | | [NanoNFCorpusRetrieval](https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/) (Boteva et al., 2016) | ['eng'] | Retrieval | s2p | [Academic, Medical, Written] | None | None | | [NanoNQRetrieval](https://ai.google.com/research/NaturalQuestions) (Tom Kwiatkowski, 2019) | ['eng'] | Retrieval | s2p | [Academic, Web] | None | None | | [NanoQuoraRetrieval](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) (DataCanary et al., 2017) | ['eng'] | Retrieval | s2s | [Social] | None | None | | [NanoSCIDOCSRetrieval](https://allenai.org/data/scidocs) (Arman Cohan, 2020) | ['eng'] | Retrieval | s2p | [Academic, Non-fiction, Written] | None | None | | [NanoSciFactRetrieval](https://github.com/allenai/scifact) (Arman Cohan, 2020) | ['eng'] | Retrieval | s2p | [Academic, Medical, Written] | None | None | -| [NanoTouche2020Retrieval](https://webis.de/events/touche-20/shared-task-1.html) | ['eng'] | Retrieval | s2p | [Academic] | None | None | +| [NanoTouche2020Retrieval](https://webis.de/events/touche-20/shared-task-1.html) (Potthast et al., 2022) | ['eng'] | Retrieval | s2p | [Academic] | None | None | | [NarrativeQARetrieval](https://metatext.io/datasets/narrativeqa) (Tomáš Kočiský, 2017) | ['eng'] | Retrieval | s2p | | None | None | -| [NepaliNewsClassification](https://github.com/goru001/nlp-for-nepali) | ['nep'] | Classification | s2s | [News, Written] | None | None | +| [NepaliNewsClassification](https://github.com/goru001/nlp-for-nepali) (Arora et al., 2020) | ['nep'] | Classification | s2s | [News, 
Written] | None | None | | [NeuCLIR2022Retrieval](https://neuclir.github.io/) (Lawrie et al., 2023) | ['fas', 'rus', 'zho'] | Retrieval | s2p | [News, Written] | None | None | | [NeuCLIR2022RetrievalHardNegatives](https://neuclir.github.io/) (Lawrie et al., 2023) | ['fas', 'rus', 'zho'] | Retrieval | s2p | [News, Written] | None | None | | [NeuCLIR2023Retrieval](https://neuclir.github.io/) (Dawn Lawrie, 2024) | ['fas', 'rus', 'zho'] | Retrieval | s2p | [News, Written] | None | None | | [NeuCLIR2023RetrievalHardNegatives](https://neuclir.github.io/) (Dawn Lawrie, 2024) | ['fas', 'rus', 'zho'] | Retrieval | s2p | [News, Written] | None | None | | [News21InstructionRetrieval](https://arxiv.org/abs/2403.15246) (Orion Weller, 2024) | ['eng'] | InstructionRetrieval | s2p | [News, Written] | None | None | | [NewsClassification](https://arxiv.org/abs/1509.01626) (Zhang et al., 2015) | ['eng'] | Classification | s2s | [News, Written] | None | None | -| [NoRecClassification](https://aclanthology.org/L18-1661/) | ['nob'] | Classification | s2s | [Reviews, Written] | None | None | +| [NoRecClassification](https://aclanthology.org/L18-1661/) (Velldal et al., 2018) | ['nob'] | Classification | s2s | [Reviews, Written] | None | None | | [NollySentiBitextMining](https://github.com/IyanuSh/NollySenti) (Shode et al., 2023) | ['eng', 'hau', 'ibo', 'pcm', 'yor'] | BitextMining | s2s | [Reviews, Social, Written] | {'train': 1640} | {'train': {'num_samples': 1640, 'number_of_characters': 445805, 'unique_pairs': 1632, 'min_sentence1_length': 3, 'average_sentence1_length': 136.32, 'max_sentence1_length': 1698, 'unique_sentence1': 405, 'min_sentence2_length': 3, 'average_sentence2_length': 135.52, 'max_sentence2_length': 1728, 'unique_sentence2': 1631, 'hf_subset_descriptive_stats': {'en-ha': {'num_samples': 410, 'number_of_characters': 115348, 'unique_pairs': 407, 'min_sentence1_length': 3, 'average_sentence1_length': 136.32, 'max_sentence1_length': 1698, 'unique_sentence1': 405, 
'min_sentence2_length': 4, 'average_sentence2_length': 145.02, 'max_sentence2_length': 1728, 'unique_sentence2': 407}, 'en-ig': {'num_samples': 410, 'number_of_characters': 107173, 'unique_pairs': 409, 'min_sentence1_length': 3, 'average_sentence1_length': 136.32, 'max_sentence1_length': 1698, 'unique_sentence1': 405, 'min_sentence2_length': 5, 'average_sentence2_length': 125.08, 'max_sentence2_length': 1137, 'unique_sentence2': 408}, 'en-pcm': {'num_samples': 410, 'number_of_characters': 109955, 'unique_pairs': 408, 'min_sentence1_length': 3, 'average_sentence1_length': 136.32, 'max_sentence1_length': 1698, 'unique_sentence1': 405, 'min_sentence2_length': 3, 'average_sentence2_length': 131.87, 'max_sentence2_length': 1552, 'unique_sentence2': 408}, 'en-yo': {'num_samples': 410, 'number_of_characters': 113329, 'unique_pairs': 409, 'min_sentence1_length': 3, 'average_sentence1_length': 136.32, 'max_sentence1_length': 1698, 'unique_sentence1': 405, 'min_sentence2_length': 6, 'average_sentence2_length': 140.1, 'max_sentence2_length': 1338, 'unique_sentence2': 409}}}} | -| [NorQuadRetrieval](https://aclanthology.org/2023.nodalida-1.17/) | ['nob'] | Retrieval | p2p | [Encyclopaedic, Non-fiction, Written] | None | None | -| [NordicLangClassification](https://aclanthology.org/2021.vardial-1.8/) | ['dan', 'fao', 'isl', 'nno', 'nob', 'swe'] | Classification | s2s | [Encyclopaedic] | None | None | +| [NorQuadRetrieval](https://aclanthology.org/2023.nodalida-1.17/) (Ivanova et al., 2023) | ['nob'] | Retrieval | p2p | [Encyclopaedic, Non-fiction, Written] | None | None | +| [NordicLangClassification](https://aclanthology.org/2021.vardial-1.8/) (Haas et al., 2021) | ['dan', 'fao', 'isl', 'nno', 'nob', 'swe'] | Classification | s2s | [Encyclopaedic] | None | None | | [NorwegianCourtsBitextMining](https://opus.nlpl.eu/index.php) (Tiedemann et al., 2020) | ['nno', 'nob'] | BitextMining | s2s | [Legal, Written] | {'test': 228} | {'test': {'num_samples': 228, 'number_of_characters': 
37441, 'unique_pairs': 228, 'min_sentence1_length': 13, 'average_sentence1_length': 82.2, 'max_sentence1_length': 272, 'unique_sentence1': 227, 'min_sentence2_length': 10, 'average_sentence2_length': 82.02, 'max_sentence2_length': 269, 'unique_sentence2': 226}} | -| [NorwegianParliamentClassification](https://huggingface.co/datasets/NbAiLab/norwegian_parliament) | ['nob'] | Classification | s2s | [Government, Spoken] | None | None | -| [NusaParagraphEmotionClassification](https://github.com/IndoNLP/nusa-writes) | ['bbc', 'bew', 'bug', 'jav', 'mad', 'mak', 'min', 'mui', 'rej', 'sun'] | Classification | s2s | [Fiction, Non-fiction, Written] | None | None | -| [NusaParagraphTopicClassification](https://github.com/IndoNLP/nusa-writes) | ['bbc', 'bew', 'bug', 'jav', 'mad', 'mak', 'min', 'mui', 'rej', 'sun'] | Classification | s2s | [Fiction, Non-fiction, Written] | None | None | +| [NorwegianParliamentClassification](https://huggingface.co/datasets/NbAiLab/norwegian_parliament) (Kummervold et al., 2021) | ['nob'] | Classification | s2s | [Government, Spoken] | None | None | +| [NusaParagraphEmotionClassification](https://github.com/IndoNLP/nusa-writes) (Cahyawijaya et al., 2023) | ['bbc', 'bew', 'bug', 'jav', 'mad', 'mak', 'min', 'mui', 'rej', 'sun'] | Classification | s2s | [Fiction, Non-fiction, Written] | None | None | +| [NusaParagraphTopicClassification](https://github.com/IndoNLP/nusa-writes) (Cahyawijaya et al., 2023) | ['bbc', 'bew', 'bug', 'jav', 'mad', 'mak', 'min', 'mui', 'rej', 'sun'] | Classification | s2s | [Fiction, Non-fiction, Written] | None | None | | [NusaTranslationBitextMining](https://huggingface.co/datasets/indonlp/nusatranslation_mt) (Cahyawijaya et al., 2023) | ['abs', 'bbc', 'bew', 'bhp', 'ind', 'jav', 'mad', 'mak', 'min', 'mui', 'rej', 'sun'] | BitextMining | s2s | [Social, Written] | {'train': 50200} | {'train': {'num_samples': 50200, 'number_of_characters': 14759870, 'unique_pairs': 50140, 'min_sentence1_length': 5, 
'average_sentence1_length': 145.46, 'max_sentence1_length': 873, 'unique_sentence1': 8258, 'min_sentence2_length': 5, 'average_sentence2_length': 148.57, 'max_sentence2_length': 980, 'unique_sentence2': 50102, 'hf_subset_descriptive_stats': {'ind-abs': {'num_samples': 1000, 'number_of_characters': 295680, 'unique_pairs': 999, 'min_sentence1_length': 5, 'average_sentence1_length': 148.37, 'max_sentence1_length': 727, 'unique_sentence1': 998, 'min_sentence2_length': 6, 'average_sentence2_length': 147.31, 'max_sentence2_length': 629, 'unique_sentence2': 998}, 'ind-btk': {'num_samples': 6600, 'number_of_characters': 1927907, 'unique_pairs': 6597, 'min_sentence1_length': 5, 'average_sentence1_length': 145.37, 'max_sentence1_length': 873, 'unique_sentence1': 6521, 'min_sentence2_length': 5, 'average_sentence2_length': 146.74, 'max_sentence2_length': 980, 'unique_sentence2': 6596}, 'ind-bew': {'num_samples': 6600, 'number_of_characters': 1939300, 'unique_pairs': 6595, 'min_sentence1_length': 5, 'average_sentence1_length': 145.43, 'max_sentence1_length': 873, 'unique_sentence1': 6512, 'min_sentence2_length': 6, 'average_sentence2_length': 148.41, 'max_sentence2_length': 840, 'unique_sentence2': 6590}, 'ind-bhp': {'num_samples': 1000, 'number_of_characters': 261666, 'unique_pairs': 1000, 'min_sentence1_length': 11, 'average_sentence1_length': 133.53, 'max_sentence1_length': 468, 'unique_sentence1': 999, 'min_sentence2_length': 10, 'average_sentence2_length': 128.14, 'max_sentence2_length': 459, 'unique_sentence2': 999}, 'ind-jav': {'num_samples': 6600, 'number_of_characters': 1922162, 'unique_pairs': 6594, 'min_sentence1_length': 5, 'average_sentence1_length': 145.43, 'max_sentence1_length': 873, 'unique_sentence1': 6512, 'min_sentence2_length': 5, 'average_sentence2_length': 145.81, 'max_sentence2_length': 854, 'unique_sentence2': 6585}, 'ind-mad': {'num_samples': 6600, 'number_of_characters': 1973257, 'unique_pairs': 6598, 'min_sentence1_length': 5, 
'average_sentence1_length': 145.36, 'max_sentence1_length': 873, 'unique_sentence1': 6521, 'min_sentence2_length': 5, 'average_sentence2_length': 153.62, 'max_sentence2_length': 827, 'unique_sentence2': 6592}, 'ind-mak': {'num_samples': 6600, 'number_of_characters': 1953868, 'unique_pairs': 6594, 'min_sentence1_length': 5, 'average_sentence1_length': 145.43, 'max_sentence1_length': 873, 'unique_sentence1': 6512, 'min_sentence2_length': 6, 'average_sentence2_length': 150.61, 'max_sentence2_length': 888, 'unique_sentence2': 6586}, 'ind-min': {'num_samples': 6600, 'number_of_characters': 1937033, 'unique_pairs': 6595, 'min_sentence1_length': 5, 'average_sentence1_length': 145.43, 'max_sentence1_length': 873, 'unique_sentence1': 6512, 'min_sentence2_length': 6, 'average_sentence2_length': 148.06, 'max_sentence2_length': 837, 'unique_sentence2': 6591}, 'ind-mui': {'num_samples': 1000, 'number_of_characters': 301448, 'unique_pairs': 1000, 'min_sentence1_length': 11, 'average_sentence1_length': 150.45, 'max_sentence1_length': 451, 'unique_sentence1': 997, 'min_sentence2_length': 11, 'average_sentence2_length': 150.99, 'max_sentence2_length': 450, 'unique_sentence2': 1000}, 'ind-rej': {'num_samples': 1000, 'number_of_characters': 291205, 'unique_pairs': 1000, 'min_sentence1_length': 9, 'average_sentence1_length': 151.62, 'max_sentence1_length': 873, 'unique_sentence1': 998, 'min_sentence2_length': 8, 'average_sentence2_length': 139.58, 'max_sentence2_length': 784, 'unique_sentence2': 1000}, 'ind-sun': {'num_samples': 6600, 'number_of_characters': 1956344, 'unique_pairs': 6591, 'min_sentence1_length': 5, 'average_sentence1_length': 145.43, 'max_sentence1_length': 873, 'unique_sentence1': 6512, 'min_sentence2_length': 5, 'average_sentence2_length': 150.99, 'max_sentence2_length': 881, 'unique_sentence2': 6588}}}} | | [NusaX-senti](https://arxiv.org/abs/2205.15960) (Winata et al., 2022) | ['ace', 'ban', 'bbc', 'bjn', 'bug', 'eng', 'ind', 'jav', 'mad', 'min', 'nij', 'sun'] | 
Classification | s2s | [Constructed, Reviews, Social, Web, Written] | None | None | | [NusaXBitextMining](https://huggingface.co/datasets/indonlp/NusaX-senti/) (Winata et al., 2023) | ['ace', 'ban', 'bbc', 'bjn', 'bug', 'eng', 'ind', 'jav', 'mad', 'min', 'nij', 'sun'] | BitextMining | s2s | [Reviews, Written] | None | None | @@ -555,14 +574,14 @@ The following tables give you an overview of the tasks in MTEB. | [OxfordPetsZeroShot](https://arxiv.org/abs/1306.5151) (Subhransu Maji, 2013) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | {'test': 3669} | {'test': {'num_samples': 3669, 'unique_num_labels': 37, 'min_image_width': 137, 'average_image_width': 443.46, 'max_image_width': 1646, 'min_image_height': 103, 'average_image_height': 399.38, 'max_image_height': 2160, 'min_label_text_length': 32, 'average_label_text_length': 40.68, 'max_label_text_length': 55, 'labels': {'0': {'count': 98}, '1': {'count': 100}, '2': {'count': 100}, '3': {'count': 100}, '4': {'count': 100}, '5': {'count': 100}, '6': {'count': 100}, '7': {'count': 88}, '8': {'count': 99}, '9': {'count': 100}, '10': {'count': 100}, '11': {'count': 97}, '12': {'count': 100}, '13': {'count': 100}, '14': {'count': 100}, '15': {'count': 100}, '16': {'count': 100}, '17': {'count': 100}, '18': {'count': 99}, '19': {'count': 100}, '20': {'count': 100}, '21': {'count': 100}, '22': {'count': 100}, '23': {'count': 100}, '24': {'count': 100}, '25': {'count': 100}, '26': {'count': 100}, '27': {'count': 100}, '28': {'count': 100}, '29': {'count': 100}, '30': {'count': 99}, '31': {'count': 100}, '32': {'count': 100}, '33': {'count': 100}, '34': {'count': 89}, '35': {'count': 100}, '36': {'count': 100}}}} | | [PAC](https://arxiv.org/pdf/2211.13112.pdf) (Łukasz Augustyniak, 2022) | ['pol'] | Classification | p2p | [Legal, Written] | None | None | | [PAWSX](https://aclanthology.org/2021.emnlp-main.357) (Shitao Xiao, 2024) | ['cmn'] | STS | s2s | | None | None | -| [PIQA](https://arxiv.org/abs/1911.11641) 
(Xiao et al., 2024) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | +| [PIQA](https://arxiv.org/abs/1911.11641) (Bisk et al., 2020) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | | [PROALegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | -| [PSC](http://www.lrec-conf.org/proceedings/lrec2014/pdf/1211_Paper.pdf) | ['pol'] | PairClassification | s2s | [News, Written] | None | None | -| [ParsinluEntail](https://github.com/persiannlp/parsinlu) | ['fas'] | PairClassification | s2s | | None | None | -| [ParsinluQueryParaphPC](https://huggingface.co/datasets/persiannlp/parsinlu_query_paraphrasing) | ['fas'] | PairClassification | s2s | | None | None | -| [PatchCamelyon](https://link.springer.com/chapter/10.1007/978-3-030-00934-2_24) | ['eng'] | ImageClassification | i2i | [Medical] | {'test': 32768} | {'test': {'num_samples': 32768, 'unique_num_labels': 2, 'min_image_width': 96, 'average_image_width': 96.0, 'max_image_width': 96, 'min_image_height': 96, 'average_image_height': 96.0, 'max_image_height': 96, 'labels': {'0': {'count': 16391}, '1': {'count': 16377}}}} | -| [PatchCamelyonZeroShot](https://link.springer.com/chapter/10.1007/978-3-030-00934-2_24) | ['eng'] | ZeroShotClassification | i2t | [Medical] | {'test': 32768} | {'test': {'num_samples': 32768, 'unique_num_labels': 2, 'min_image_width': 96, 'average_image_width': 96.0, 'max_image_width': 96, 'min_image_height': 96, 'average_image_height': 96.0, 'max_image_height': 96, 'min_label_text_length': 35, 'average_label_text_length': 52.0, 'max_label_text_length': 69, 'labels': {'0': {'count': 16391}, '1': {'count': 16377}}}} | -| [PatentClassification](https://aclanthology.org/P19-1212.pdf) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [PSC](http://www.lrec-conf.org/proceedings/lrec2014/pdf/1211_Paper.pdf) (Ogrodniczuk et al., 2014) | 
['pol'] | PairClassification | s2s | [News, Written] | None | None | +| [ParsinluEntail](https://github.com/persiannlp/parsinlu) (Daniel Khashabi, 2021) | ['fas'] | PairClassification | s2s | [Reviews, Written] | None | None | +| [ParsinluQueryParaphPC](https://huggingface.co/datasets/persiannlp/parsinlu_query_paraphrasing) (Daniel Khashabi, 2021) | ['fas'] | PairClassification | s2s | [Reviews, Written] | None | None | +| [PatchCamelyon](https://link.springer.com/chapter/10.1007/978-3-030-00934-2_24) (Veeling et al., 2018) | ['eng'] | ImageClassification | i2i | [Medical] | {'test': 32768} | {'test': {'num_samples': 32768, 'unique_num_labels': 2, 'min_image_width': 96, 'average_image_width': 96.0, 'max_image_width': 96, 'min_image_height': 96, 'average_image_height': 96.0, 'max_image_height': 96, 'labels': {'0': {'count': 16391}, '1': {'count': 16377}}}} | +| [PatchCamelyonZeroShot](https://link.springer.com/chapter/10.1007/978-3-030-00934-2_24) (Veeling et al., 2018) | ['eng'] | ZeroShotClassification | i2t | [Medical] | {'test': 32768} | {'test': {'num_samples': 32768, 'unique_num_labels': 2, 'min_image_width': 96, 'average_image_width': 96.0, 'max_image_width': 96, 'min_image_height': 96, 'average_image_height': 96.0, 'max_image_height': 96, 'min_label_text_length': 35, 'average_label_text_length': 52.0, 'max_label_text_length': 69, 'labels': {'0': {'count': 16391}, '1': {'count': 16377}}}} | +| [PatentClassification](https://aclanthology.org/P19-1212.pdf) (Sharma et al., 2019) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [PawsXPairClassification](https://arxiv.org/abs/1908.11828) (Yinfei Yang, 2019) | ['cmn', 'deu', 'eng', 'fra', 'jpn', 'kor', 'spa'] | PairClassification | s2s | [Encyclopaedic, Web, Written] | {'test': 14000, 'validation': 14000} | {'test': {'num_samples': 14000, 'number_of_characters': 2551922, 'min_sentence1_length': 2, 'avg_sentence1_length': 91.18, 'max_sentence1_length': 268, 'unique_sentence1': 13404, 
'min_sentence2_length': 2, 'avg_sentence2_length': 91.1, 'max_sentence2_length': 247, 'unique_sentence2': 13462, 'unique_labels': 2, 'labels': {'1': {'count': 6285}, '0': {'count': 7715}}, 'hf_subset_descriptive_stats': {'de': {'num_samples': 2000, 'number_of_characters': 478034, 'min_sentence1_length': 2, 'avg_sentence1_length': 119.78, 'max_sentence1_length': 268, 'unique_sentence1': 1934, 'min_sentence2_length': 2, 'avg_sentence2_length': 119.24, 'max_sentence2_length': 235, 'unique_sentence2': 1938, 'unique_labels': 2, 'labels': {'1': {'count': 895}, '0': {'count': 1105}}}, 'en': {'num_samples': 2000, 'number_of_characters': 454362, 'min_sentence1_length': 25, 'avg_sentence1_length': 113.76, 'max_sentence1_length': 209, 'unique_sentence1': 1761, 'min_sentence2_length': 25, 'avg_sentence2_length': 113.42, 'max_sentence2_length': 209, 'unique_sentence2': 1800, 'unique_labels': 2, 'labels': {'1': {'count': 907}, '0': {'count': 1093}}}, 'es': {'num_samples': 2000, 'number_of_characters': 471226, 'min_sentence1_length': 2, 'avg_sentence1_length': 117.81, 'max_sentence1_length': 226, 'unique_sentence1': 1955, 'min_sentence2_length': 22, 'avg_sentence2_length': 117.8, 'max_sentence2_length': 233, 'unique_sentence2': 1959, 'unique_labels': 2, 'labels': {'1': {'count': 907}, '0': {'count': 1093}}}, 'fr': {'num_samples': 2000, 'number_of_characters': 480033, 'min_sentence1_length': 2, 'avg_sentence1_length': 120.03, 'max_sentence1_length': 238, 'unique_sentence1': 1954, 'min_sentence2_length': 2, 'avg_sentence2_length': 119.99, 'max_sentence2_length': 247, 'unique_sentence2': 1953, 'unique_labels': 2, 'labels': {'1': {'count': 903}, '0': {'count': 1097}}}, 'ja': {'num_samples': 2000, 'number_of_characters': 235106, 'min_sentence1_length': 2, 'avg_sentence1_length': 58.68, 'max_sentence1_length': 192, 'unique_sentence1': 1944, 'min_sentence2_length': 2, 'avg_sentence2_length': 58.88, 'max_sentence2_length': 198, 'unique_sentence2': 1941, 'unique_labels': 2, 'labels': 
{'1': {'count': 883}, '0': {'count': 1117}}}, 'ko': {'num_samples': 2000, 'number_of_characters': 260149, 'min_sentence1_length': 2, 'avg_sentence1_length': 64.96, 'max_sentence1_length': 153, 'unique_sentence1': 1954, 'min_sentence2_length': 2, 'avg_sentence2_length': 65.11, 'max_sentence2_length': 159, 'unique_sentence2': 1969, 'unique_labels': 2, 'labels': {'1': {'count': 896}, '0': {'count': 1104}}}, 'zh': {'num_samples': 2000, 'number_of_characters': 173012, 'min_sentence1_length': 2, 'avg_sentence1_length': 43.23, 'max_sentence1_length': 120, 'unique_sentence1': 1909, 'min_sentence2_length': 2, 'avg_sentence2_length': 43.27, 'max_sentence2_length': 113, 'unique_sentence2': 1909, 'unique_labels': 2, 'labels': {'1': {'count': 894}, '0': {'count': 1106}}}}}, 'validation': {'num_samples': 14000, 'number_of_characters': 2524625, 'min_sentence1_length': 2, 'avg_sentence1_length': 90.13, 'max_sentence1_length': 248, 'unique_sentence1': 13357, 'min_sentence2_length': 2, 'avg_sentence2_length': 90.2, 'max_sentence2_length': 275, 'unique_sentence2': 13397, 'unique_labels': 2, 'labels': {'1': {'count': 5948}, '0': {'count': 8052}}, 'hf_subset_descriptive_stats': {'de': {'num_samples': 2000, 'number_of_characters': 467643, 'min_sentence1_length': 2, 'avg_sentence1_length': 116.82, 'max_sentence1_length': 248, 'unique_sentence1': 1914, 'min_sentence2_length': 2, 'avg_sentence2_length': 117.0, 'max_sentence2_length': 275, 'unique_sentence2': 1920, 'unique_labels': 2, 'labels': {'1': {'count': 831}, '0': {'count': 1169}}}, 'en': {'num_samples': 2000, 'number_of_characters': 451931, 'min_sentence1_length': 25, 'avg_sentence1_length': 113.11, 'max_sentence1_length': 213, 'unique_sentence1': 1758, 'min_sentence2_length': 25, 'avg_sentence2_length': 112.86, 'max_sentence2_length': 213, 'unique_sentence2': 1771, 'unique_labels': 2, 'labels': {'1': {'count': 863}, '0': {'count': 1137}}}, 'es': {'num_samples': 2000, 'number_of_characters': 466112, 'min_sentence1_length': 2, 
'avg_sentence1_length': 116.33, 'max_sentence1_length': 240, 'unique_sentence1': 1938, 'min_sentence2_length': 2, 'avg_sentence2_length': 116.73, 'max_sentence2_length': 241, 'unique_sentence2': 1941, 'unique_labels': 2, 'labels': {'1': {'count': 847}, '0': {'count': 1153}}}, 'fr': {'num_samples': 2000, 'number_of_characters': 478510, 'min_sentence1_length': 2, 'avg_sentence1_length': 119.5, 'max_sentence1_length': 233, 'unique_sentence1': 1933, 'min_sentence2_length': 2, 'avg_sentence2_length': 119.75, 'max_sentence2_length': 246, 'unique_sentence2': 1939, 'unique_labels': 2, 'labels': {'1': {'count': 860}, '0': {'count': 1140}}}, 'ja': {'num_samples': 2000, 'number_of_characters': 229655, 'min_sentence1_length': 2, 'avg_sentence1_length': 57.51, 'max_sentence1_length': 126, 'unique_sentence1': 1957, 'min_sentence2_length': 2, 'avg_sentence2_length': 57.32, 'max_sentence2_length': 121, 'unique_sentence2': 1969, 'unique_labels': 2, 'labels': {'1': {'count': 854}, '0': {'count': 1146}}}, 'ko': {'num_samples': 2000, 'number_of_characters': 261355, 'min_sentence1_length': 2, 'avg_sentence1_length': 65.16, 'max_sentence1_length': 178, 'unique_sentence1': 1963, 'min_sentence2_length': 2, 'avg_sentence2_length': 65.52, 'max_sentence2_length': 174, 'unique_sentence2': 1968, 'unique_labels': 2, 'labels': {'1': {'count': 840}, '0': {'count': 1160}}}, 'zh': {'num_samples': 2000, 'number_of_characters': 169419, 'min_sentence1_length': 2, 'avg_sentence1_length': 42.45, 'max_sentence1_length': 101, 'unique_sentence1': 1899, 'min_sentence2_length': 2, 'avg_sentence2_length': 42.26, 'max_sentence2_length': 120, 'unique_sentence2': 1895, 'unique_labels': 2, 'labels': {'1': {'count': 853}, '0': {'count': 1147}}}}}} | | [PersianFoodSentimentClassification](https://hooshvare.github.io/docs/datasets/sa) (Mehrdad Farahani et al., 2020) | ['fas'] | Classification | s2s | [Reviews, Written] | None | None | | 
[PersianTextEmotion](https://huggingface.co/datasets/SeyedAli/Persian-Text-Emotion) | ['fas'] | Classification | s2s | | None | None | @@ -573,7 +592,7 @@ The following tables give you an overview of the tasks in MTEB. | [PlscClusteringP2P.v2](https://huggingface.co/datasets/rafalposwiata/plsc) | ['pol'] | Clustering | s2s | [Academic, Written] | None | None | | [PlscClusteringS2S.v2](https://huggingface.co/datasets/rafalposwiata/plsc) | ['pol'] | Clustering | s2s | [Academic, Written] | None | None | | [PoemSentimentClassification](https://arxiv.org/abs/2011.02686) (Emily Sheng, 2020) | ['eng'] | Classification | s2s | [Reviews, Written] | None | None | -| [PolEmo2.0-IN](https://aclanthology.org/K19-1092.pdf) | ['pol'] | Classification | s2s | [Social, Written] | None | None | +| [PolEmo2.0-IN](https://aclanthology.org/K19-1092.pdf) (Kocoń et al., 2019) | ['pol'] | Classification | s2s | [Social, Written] | None | None | | [PolEmo2.0-OUT](https://aclanthology.org/K19-1092.pdf) | ['pol'] | Classification | s2s | [Social, Written] | None | None | | [PpcPC](https://arxiv.org/pdf/2207.12759.pdf) (Sławomir Dadas, 2022) | ['pol'] | PairClassification | s2s | [Fiction, News, Non-fiction, Social, Spoken, Web, Written] | None | None | | [PubChemAISentenceParaphrasePC](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | PairClassification | s2s | [Chemistry] | None | None | @@ -582,10 +601,10 @@ The following tables give you an overview of the tasks in MTEB. 
| [PubChemSynonymPC](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | PairClassification | s2s | [Chemistry] | None | None | | [PubChemWikiPairClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['ces', 'deu', 'eng', 'fra', 'hin', 'jpn', 'kor', 'msa', 'nld', 'por', 'spa', 'tur', 'zho'] | PairClassification | s2s | [Chemistry] | None | None | | [PubChemWikiParagraphsPC](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | PairClassification | p2p | [Chemistry] | None | None | -| [PublicHealthQA](https://huggingface.co/datasets/xhluca/publichealth-qa) | ['ara', 'eng', 'fra', 'kor', 'rus', 'spa', 'vie', 'zho'] | Retrieval | s2p | [Government, Medical, Web, Written] | None | None | +| [PublicHealthQA](https://huggingface.co/datasets/xhluca/publichealth-qa) (Xing Han Lu, 2024) | ['ara', 'eng', 'fra', 'kor', 'rus', 'spa', 'vie', 'zho'] | Retrieval | s2p | [Government, Medical, Web, Written] | None | None | | [PunjabiNewsClassification](https://github.com/goru001/nlp-for-punjabi/) (Anoop Kunchukuttan, 2020) | ['pan'] | Classification | s2s | [News, Written] | None | None | | [QBQTC](https://github.com/CLUEbenchmark/QBQTC/tree/main/dataset) | ['cmn'] | STS | s2s | | None | None | -| [Quail](https://text-machine.cs.uml.edu/lab2/projects/quail/) (Xiao et al., 2024) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | +| [Quail](https://text-machine.cs.uml.edu/lab2/projects/quail/) (Rogers et al., 2020) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | | [Query2Query](https://mcinext.com/) | ['fas'] | STS | s2s | | None | None | | [Quora-NL](https://huggingface.co/datasets/clips/beir-nl-quora) (Nikolay Banar, 2024) | ['nld'] | Retrieval | s2s | [Written] | None | None | | [Quora-PL](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2s | | None | None | @@ -593,22 +612,23 @@ The following tables give you an 
overview of the tasks in MTEB. | [QuoraRetrieval](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) (DataCanary et al., 2017) | ['eng'] | Retrieval | s2s | [Blog, Web, Written] | None | None | | [QuoraRetrieval-Fa](https://huggingface.co/datasets/MCINext/quora-fa) | ['fas'] | Retrieval | s2s | [Web] | None | None | | [QuoraRetrievalHardNegatives](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) (DataCanary et al., 2017) | ['eng'] | Retrieval | s2s | | None | None | -| [RARbCode](https://arxiv.org/abs/2404.06347) (Xiao et al., 2024) | ['eng'] | Retrieval | s2p | [Programming, Written] | None | None | -| [RARbMath](https://arxiv.org/abs/2404.06347) (Xiao et al., 2024) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | +| [RARbCode](https://arxiv.org/abs/2404.06347) (Husain et al., 2019) | ['eng'] | Retrieval | s2p | [Programming, Written] | None | None | +| [RARbMath](https://arxiv.org/abs/2404.06347) (Cobbe et al., 2021) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [RESISC45](https://ieeexplore.ieee.org/abstract/document/7891544) (Cheng et al., 2017) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | {'test': 6300} | {'test': {'num_samples': 6300, 'unique_num_labels': 45, 'min_image_width': 256, 'average_image_width': 256.0, 'max_image_width': 256, 'min_image_height': 256, 'average_image_height': 256.0, 'max_image_height': 256, 'labels': {'31': {'count': 135}, '11': {'count': 144}, '28': {'count': 135}, '43': {'count': 154}, '41': {'count': 144}, '33': {'count': 134}, '19': {'count': 130}, '16': {'count': 127}, '22': {'count': 130}, '34': {'count': 143}, '24': {'count': 164}, '0': {'count': 169}, '13': {'count': 146}, '25': {'count': 115}, '6': {'count': 132}, '36': {'count': 135}, '39': {'count': 142}, '18': {'count': 140}, '23': {'count': 147}, '37': {'count': 159}, '15': {'count': 122}, '29': {'count': 140}, '9': {'count': 159}, '27': {'count': 140}, '21': 
{'count': 131}, '3': {'count': 134}, '1': {'count': 162}, '32': {'count': 153}, '26': {'count': 150}, '35': {'count': 151}, '44': {'count': 118}, '30': {'count': 154}, '20': {'count': 139}, '4': {'count': 130}, '42': {'count': 127}, '40': {'count': 137}, '5': {'count': 140}, '17': {'count': 142}, '2': {'count': 123}, '38': {'count': 130}, '10': {'count': 140}, '12': {'count': 146}, '8': {'count': 146}, '7': {'count': 143}, '14': {'count': 118}}}} | | [RESISC45ZeroShot](https://ieeexplore.ieee.org/abstract/document/7891544) (Cheng et al., 2017) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | {'test': 6300} | {'test': {'num_samples': 6300, 'unique_num_labels': 45, 'min_image_width': 256, 'average_image_width': 256.0, 'max_image_width': 256, 'min_image_height': 256, 'average_image_height': 256.0, 'max_image_height': 256, 'min_label_text_length': 26, 'average_label_text_length': 32.16, 'max_label_text_length': 43, 'labels': {'31': {'count': 135}, '11': {'count': 144}, '28': {'count': 135}, '43': {'count': 154}, '41': {'count': 144}, '33': {'count': 134}, '19': {'count': 130}, '16': {'count': 127}, '22': {'count': 130}, '34': {'count': 143}, '24': {'count': 164}, '0': {'count': 169}, '13': {'count': 146}, '25': {'count': 115}, '6': {'count': 132}, '36': {'count': 135}, '39': {'count': 142}, '18': {'count': 140}, '23': {'count': 147}, '37': {'count': 159}, '15': {'count': 122}, '29': {'count': 140}, '9': {'count': 159}, '27': {'count': 140}, '21': {'count': 131}, '3': {'count': 134}, '1': {'count': 162}, '32': {'count': 153}, '26': {'count': 150}, '35': {'count': 151}, '44': {'count': 118}, '30': {'count': 154}, '20': {'count': 139}, '4': {'count': 130}, '42': {'count': 127}, '40': {'count': 137}, '5': {'count': 140}, '17': {'count': 142}, '2': {'count': 123}, '38': {'count': 130}, '10': {'count': 140}, '12': {'count': 146}, '8': {'count': 146}, '7': {'count': 143}, '14': {'count': 118}}}} | -| 
[ROxfordEasyI2IRetrieval](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Oxford_and_CVPR_2018_paper.html) (Radenovi{'c, 2018) | ['eng'] | Any2AnyRetrieval | i2i | [Web] | {'test': 5063} | {'test': {'number_of_characters': 0, 'num_samples': 5063, 'num_queries': 70, 'num_documents': 4993, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'min_document_image_width': 256, 'average_document_image_width': 256.0, 'max_document_image_width': 256, 'min_document_image_height': 256, 'average_document_image_height': 256.0, 'max_document_image_height': 256, 'num_document_images': 4993, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 70, 'min_query_image_width': 256, 'average_query_image_width': 256.0, 'max_query_image_width': 256, 'min_query_image_height': 256, 'average_query_image_height': 256.0, 'max_query_image_height': 256, 'min_relevant_docs_per_query': 0, 'average_relevant_docs_per_query': 43.27, 'max_relevant_docs_per_query': 248, 'unique_relevant_docs': 4993}} | -| [ROxfordHardI2IRetrieval](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Oxford_and_CVPR_2018_paper.html) (Radenovi{'c, 2018) | ['eng'] | Any2AnyRetrieval | i2i | [Web] | {'test': 5063} | {'test': {'number_of_characters': 0, 'num_samples': 5063, 'num_queries': 70, 'num_documents': 4993, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'min_document_image_width': 256, 'average_document_image_width': 256.0, 'max_document_image_width': 256, 'min_document_image_height': 256, 'average_document_image_height': 256.0, 'max_document_image_height': 256, 'num_document_images': 4993, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 70, 'min_query_image_width': 256, 'average_query_image_width': 256.0, 'max_query_image_width': 256, 
'min_query_image_height': 256, 'average_query_image_height': 256.0, 'max_query_image_height': 256, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 35.67, 'max_relevant_docs_per_query': 284, 'unique_relevant_docs': 4993}} | -| [ROxfordMediumI2IRetrieval](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Oxford_and_CVPR_2018_paper.html) (Radenovi{'c, 2018) | ['eng'] | Any2AnyRetrieval | i2i | [Web] | {'test': 5063} | {'test': {'number_of_characters': 0, 'num_samples': 5063, 'num_queries': 70, 'num_documents': 4993, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'min_document_image_width': 256, 'average_document_image_width': 256.0, 'max_document_image_width': 256, 'min_document_image_height': 256, 'average_document_image_height': 256.0, 'max_document_image_height': 256, 'num_document_images': 4993, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 70, 'min_query_image_width': 256, 'average_query_image_width': 256.0, 'max_query_image_width': 256, 'min_query_image_height': 256, 'average_query_image_height': 256.0, 'max_query_image_height': 256, 'min_relevant_docs_per_query': 2, 'average_relevant_docs_per_query': 78.94, 'max_relevant_docs_per_query': 347, 'unique_relevant_docs': 4993}} | +| [ROxfordEasyI2IRetrieval](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Oxford_and_CVPR_2018_paper.html) (Radenovi{\'c, 2018) | ['eng'] | Any2AnyRetrieval | i2i | [Web] | {'test': 5063} | {'test': {'number_of_characters': 0, 'num_samples': 5063, 'num_queries': 70, 'num_documents': 4993, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'min_document_image_width': 256, 'average_document_image_width': 256.0, 'max_document_image_width': 256, 'min_document_image_height': 256, 'average_document_image_height': 256.0, 
'max_document_image_height': 256, 'num_document_images': 4993, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 70, 'min_query_image_width': 256, 'average_query_image_width': 256.0, 'max_query_image_width': 256, 'min_query_image_height': 256, 'average_query_image_height': 256.0, 'max_query_image_height': 256, 'min_relevant_docs_per_query': 0, 'average_relevant_docs_per_query': 43.27, 'max_relevant_docs_per_query': 248, 'unique_relevant_docs': 4993}} | +| [ROxfordHardI2IRetrieval](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Oxford_and_CVPR_2018_paper.html) (Radenovi{\'c, 2018) | ['eng'] | Any2AnyRetrieval | i2i | [Web] | {'test': 5063} | {'test': {'number_of_characters': 0, 'num_samples': 5063, 'num_queries': 70, 'num_documents': 4993, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'min_document_image_width': 256, 'average_document_image_width': 256.0, 'max_document_image_width': 256, 'min_document_image_height': 256, 'average_document_image_height': 256.0, 'max_document_image_height': 256, 'num_document_images': 4993, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 70, 'min_query_image_width': 256, 'average_query_image_width': 256.0, 'max_query_image_width': 256, 'min_query_image_height': 256, 'average_query_image_height': 256.0, 'max_query_image_height': 256, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 35.67, 'max_relevant_docs_per_query': 284, 'unique_relevant_docs': 4993}} | +| [ROxfordMediumI2IRetrieval](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Oxford_and_CVPR_2018_paper.html) (Radenovi{\'c, 2018) | ['eng'] | Any2AnyRetrieval | i2i | [Web] | {'test': 5063} | {'test': {'number_of_characters': 0, 'num_samples': 5063, 'num_queries': 70, 'num_documents': 4993, 'min_document_length': 0, 
'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'min_document_image_width': 256, 'average_document_image_width': 256.0, 'max_document_image_width': 256, 'min_document_image_height': 256, 'average_document_image_height': 256.0, 'max_document_image_height': 256, 'num_document_images': 4993, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 70, 'min_query_image_width': 256, 'average_query_image_width': 256.0, 'max_query_image_width': 256, 'min_query_image_height': 256, 'average_query_image_height': 256.0, 'max_query_image_height': 256, 'min_relevant_docs_per_query': 2, 'average_relevant_docs_per_query': 78.94, 'max_relevant_docs_per_query': 347, 'unique_relevant_docs': 4993}} | | [RP2kI2IRetrieval](https://arxiv.org/abs/2006.12634) (Peng et al., 2020) | ['eng'] | Any2AnyRetrieval | i2i | [Web] | {'test': 77643} | {'test': {'number_of_characters': 0, 'num_samples': 77643, 'num_queries': 38186, 'num_documents': 39457, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 39457, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 38186, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 115.47, 'max_relevant_docs_per_query': 1069, 'unique_relevant_docs': 38181}} | -| [RParisEasyI2IRetrieval](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Paris_and_CVPR_2018_paper.html) (Radenovi{'c, 2018) | ['eng'] | Any2AnyRetrieval | i2i | [Web] | {'test': 6392} | {'test': {'number_of_characters': 0, 'num_samples': 6392, 'num_queries': 70, 'num_documents': 6322, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'min_document_image_width': 256, 'average_document_image_width': 256.0, 'max_document_image_width': 256, 'min_document_image_height': 256, 
'average_document_image_height': 256.0, 'max_document_image_height': 256, 'num_document_images': 6322, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 70, 'min_query_image_width': 256, 'average_query_image_width': 256.0, 'max_query_image_width': 256, 'min_query_image_height': 256, 'average_query_image_height': 256.0, 'max_query_image_height': 256, 'min_relevant_docs_per_query': 2, 'average_relevant_docs_per_query': 98.2, 'max_relevant_docs_per_query': 199, 'unique_relevant_docs': 6322}} | -| [RParisHardI2IRetrieval](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Paris_and_CVPR_2018_paper.html) (Radenovi{'c, 2018) | ['eng'] | Any2AnyRetrieval | i2i | [Web] | {'test': 6392} | {'test': {'number_of_characters': 0, 'num_samples': 6392, 'num_queries': 70, 'num_documents': 6322, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'min_document_image_width': 256, 'average_document_image_width': 256.0, 'max_document_image_width': 256, 'min_document_image_height': 256, 'average_document_image_height': 256.0, 'max_document_image_height': 256, 'num_document_images': 6322, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 70, 'min_query_image_width': 256, 'average_query_image_width': 256.0, 'max_query_image_width': 256, 'min_query_image_height': 256, 'average_query_image_height': 256.0, 'max_query_image_height': 256, 'min_relevant_docs_per_query': 34, 'average_relevant_docs_per_query': 147.86, 'max_relevant_docs_per_query': 556, 'unique_relevant_docs': 6322}} | -| [RParisMediumI2IRetrieval](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Paris_and_CVPR_2018_paper.html) (Radenovi{'c, 2018) | ['eng'] | Any2AnyRetrieval | i2i | [Web] | {'test': 6392} | {'test': {'number_of_characters': 0, 'num_samples': 6392, 'num_queries': 70, 'num_documents': 
6322, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'min_document_image_width': 256, 'average_document_image_width': 256.0, 'max_document_image_width': 256, 'min_document_image_height': 256, 'average_document_image_height': 256.0, 'max_document_image_height': 256, 'num_document_images': 6322, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 70, 'min_query_image_width': 256, 'average_query_image_width': 256.0, 'max_query_image_width': 256, 'min_query_image_height': 256, 'average_query_image_height': 256.0, 'max_query_image_height': 256, 'min_relevant_docs_per_query': 76, 'average_relevant_docs_per_query': 246.06, 'max_relevant_docs_per_query': 636, 'unique_relevant_docs': 6322}} | -| [RTE3](https://aclanthology.org/W07-1401/) | ['deu', 'eng', 'fra', 'ita'] | PairClassification | s2s | [Encyclopaedic, News, Web, Written] | None | None | -| [RUParaPhraserSTS](https://aclanthology.org/2020.ngt-1.6) (Pivovarova et al., 2017) | ['rus'] | STS | s2s | [News, Written] | None | None | -| [ReMuQIT2TRetrieval](https://github.com/luomancs/ReMuQ) | ['eng'] | Any2AnyRetrieval | it2t | [Encyclopaedic] | {'test': 142403} | {'test': {'number_of_characters': 29161615, 'num_samples': 142403, 'num_queries': 3609, 'num_documents': 138794, 'min_document_length': 9, 'average_document_length': 208.19, 'max_document_length': 508, 'unique_documents': 138794, 'num_document_images': 0, 'min_query_length': 18, 'average_query_length': 73.86, 'max_query_length': 218, 'unique_queries': 3608, 'num_query_images': 3609, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3607}} | -| [RedditClustering.v2](https://arxiv.org/abs/2104.07081) (Gregor Geigle and Nils Reimers and Andreas R{"u, 2021) | ['eng'] | Clustering | s2s | [Social, Web, Written] | None | None | -| 
[RedditClusteringP2P.v2](https://arxiv.org/abs/2104.07081) (Gregor Geigle and Nils Reimers and Andreas R{"u, 2021) | ['eng'] | Clustering | p2p | [Social, Web, Written] | {'test': 459389} | {'test': {'num_samples': 459389, 'number_of_characters': 334286895, 'min_text_length': 79, 'average_text_length': 727.68, 'max_text_length': 4359, 'min_labels_per_text': 2, 'average_labels_per_text': 1.0, 'max_labels_per_text': 77908, 'unique_labels': 440, 'labels': {'FortNiteBR': {'count': 436}, 'buildapc': {'count': 8484}, 'offmychest': {'count': 570}, 'nus': {'count': 45}, 'relationship_advice': {'count': 16651}, 'premed': {'count': 201}, 'dogecoin': {'count': 8108}, 'GamingLaptops': {'count': 183}, 'asktransgender': {'count': 326}, 'MachineLearning': {'count': 61}, 'puppy101': {'count': 1597}, 'GunAccessoriesForSale': {'count': 2619}, 'Random_Acts_Of_Amazon': {'count': 1115}, 'Catholicism': {'count': 183}, 'MonsterHunter': {'count': 218}, 'tipofmypenis': {'count': 87}, 'samsung': {'count': 69}, 'PersonalFinanceCanada': {'count': 341}, 'Dyson_Sphere_Program': {'count': 55}, 'bleach': {'count': 41}, 'AmItheAsshole': {'count': 3730}, 'WallStreetbetsELITE': {'count': 328}, 'GlobalPowers': {'count': 35}, 'ABraThatFits': {'count': 159}, 'PokemonGoFriends': {'count': 1165}, 'NoMansSkyTheGame': {'count': 259}, 'masseffect': {'count': 233}, 'dating_advice': {'count': 559}, 'yoga': {'count': 50}, 'depression': {'count': 515}, 'COVID19positive': {'count': 180}, 'generationology': {'count': 37}, 'feedthebeast': {'count': 192}, 'EliteDangerous': {'count': 270}, 'alcoholicsanonymous': {'count': 93}, 'GoRVing': {'count': 35}, 'thedivision': {'count': 111}, 'breakingmom': {'count': 105}, 'AskAnAmerican': {'count': 80}, 'HypnoFair': {'count': 5}, 'JustUnsubbed': {'count': 13}, 'socialanxiety': {'count': 123}, 'dirtykikpals': {'count': 202}, 'askTO': {'count': 126}, 'AskCulinary': {'count': 108}, 'Bogleheads': {'count': 71}, 'dragonquest': {'count': 45}, 'NoContract': {'count': 30}, 
'gorillaz': {'count': 14}, 'MondoGore': {'count': 8}, 'comicswap': {'count': 56}, 'VirtualYoutubers': {'count': 92}, 'Gta5Modding': {'count': 28}, 'obs': {'count': 61}, 'vcu': {'count': 9}, 'KingkillerChronicle': {'count': 17}, 'AmongUs': {'count': 41}, 'wireshark': {'count': 3}, 'Dodocodes': {'count': 46}, 'Aliexpress': {'count': 40}, 'LearnerDriverUK': {'count': 12}, 'PanicAttack': {'count': 23}, 'KassadinMains': {'count': 10}, 'islam': {'count': 93}, 'chronotrigger': {'count': 4}, 'skincareexchange': {'count': 13}, 'PokemonHome': {'count': 21}, 'survivinginfidelity': {'count': 71}, 'igcse': {'count': 21}, 'C25K': {'count': 21}, 'aorus': {'count': 2}, 'idleon': {'count': 19}, 'photography': {'count': 22}, 'cryptocoins': {'count': 7}, 'CanaryWharfBets': {'count': 7}, 'KillingEve': {'count': 7}, 'GameBuilderGarage': {'count': 16}, 'SauceSharingCommunity': {'count': 7}, 'turo': {'count': 9}, 'foodscience': {'count': 14}, 'HIMYM': {'count': 20}, 'HauntingOfHillHouse': {'count': 4}, 'GoodNotes': {'count': 8}, 'RedditWritesSeinfeld': {'count': 6}, 'AirReps': {'count': 2}, 'ADHD': {'count': 3811}, 'BuddyCrossing': {'count': 446}, 'libraryofruina': {'count': 98}, 'SluttyConfessions': {'count': 2787}, 'tipofmytongue': {'count': 7145}, 'fleshlight': {'count': 128}, 'amcstock': {'count': 13910}, 'teenagers': {'count': 77908}, 'suggestmeabook': {'count': 1540}, 'dirtypenpals': {'count': 5587}, 'MinecraftServer': {'count': 177}, 'CreditCards': {'count': 669}, 'Guitar': {'count': 10952}, 'rpg': {'count': 529}, 'NoFap': {'count': 14853}, 'lfg': {'count': 1093}, 'MarsWallStreet': {'count': 935}, 'SummonSign': {'count': 931}, 'AssassinsCreedValhala': {'count': 295}, 'hoi4': {'count': 432}, 'Coins4Sale': {'count': 260}, 'xbox': {'count': 459}, 'TooAfraidToAsk': {'count': 7404}, 'NBA2k': {'count': 553}, 'KGBTR': {'count': 943}, 'roblox': {'count': 220}, 'salesforce': {'count': 214}, 'TwoXChromosomes': {'count': 1736}, 'mechmarket': {'count': 4863}, 'Gaming_Headsets': {'count': 
103}, 'pittsburgh': {'count': 189}, 'CryptoMars': {'count': 1606}, 'FridayNightFunkin': {'count': 378}, 'vaginismus': {'count': 122}, 'transpositive': {'count': 10}, 'comicbooks': {'count': 274}, 'BDSMcommunity': {'count': 185}, 'aliens': {'count': 201}, 'Scotch': {'count': 64}, 'KikRoleplay': {'count': 141}, 'Kayaking': {'count': 91}, '196': {'count': 47}, 'digimon': {'count': 140}, 'Evernote': {'count': 42}, 'logh': {'count': 22}, 'arlington': {'count': 15}, 'Adopted': {'count': 8}, 'DissonautUniverse': {'count': 4}, 'Midsommar': {'count': 12}, 'SofiawithanF': {'count': 83}, 'xmpp': {'count': 6}, 'ZombsRoyale': {'count': 16}, 'accesscontrol': {'count': 8}, 'WetlanderHumor': {'count': 2}, 'PoonamPandeyFanatics': {'count': 2}, 'screenplaychallenge': {'count': 2}, 'scatstories': {'count': 2}, 'techsupport': {'count': 290}, 'whatcarshouldIbuy': {'count': 79}, 'Stormlight_Archive': {'count': 15}, 'deadbydaylight': {'count': 126}, 'bicycling': {'count': 27}, 'oculus': {'count': 64}, 'Cartalk': {'count': 33}, 'Sims4': {'count': 43}, 'NoFeeAC': {'count': 95}, 'Crypto_com': {'count': 37}, 'ITCareerQuestions': {'count': 259}, 'aromantic': {'count': 18}, 'Revu': {'count': 3}, 'exalted': {'count': 2}, 'HilariaBaldwin': {'count': 20}, 'Testosterone': {'count': 35}, 'Screenwriting': {'count': 170}, 'LifeProTips': {'count': 49}, 'steinsgate': {'count': 13}, 'Baystreetbets': {'count': 10}, 'AskGirls': {'count': 7}, 'idlechampions': {'count': 7}, 'facebook': {'count': 17}, 'tf2trade': {'count': 4}, 'mfdoom': {'count': 3}, 'FiddlesticksMains': {'count': 2}, 'HFY': {'count': 10}, 'FiestaST': {'count': 2}, 'whatsthatbook': {'count': 994}, 'GearsOfWar': {'count': 879}, 'KazuhaMains': {'count': 175}, 'RepTime': {'count': 211}, 'AstroGaming': {'count': 141}, 'metalgearsolid': {'count': 152}, 'qBittorrent': {'count': 39}, 'ELLIPAL_Official': {'count': 24}, 'raisedbynarcissists': {'count': 4895}, 'unpopularopinion': {'count': 14901}, 'ACTrade': {'count': 5679}, 'askcarsales': {'count': 
1339}, 'AskVet': {'count': 1357}, 'whowouldwin': {'count': 4493}, 'playstation': {'count': 1362}, 'anime': {'count': 6531}, 'GME': {'count': 12577}, 'DotA2': {'count': 2004}, 'cryptostreetbets': {'count': 2241}, 'MonsterHunterWorld': {'count': 698}, 'Market76': {'count': 14274}, 'DnD': {'count': 5092}, 'leagueoflegends': {'count': 3683}, 'doordash_drivers': {'count': 1626}, 'theta_network': {'count': 489}, 'exmuslim': {'count': 1369}, 'gonewildaudio': {'count': 2998}, 'conspiracy': {'count': 3587}, 'heroesofthestorm': {'count': 535}, 'FanFiction': {'count': 2782}, 'Doom': {'count': 1251}, 'texas': {'count': 269}, 'Vent': {'count': 1738}, 'selfimprovement': {'count': 1284}, 'youtubers': {'count': 706}, 'askseddit': {'count': 237}, 'boardgames': {'count': 1237}, 'bravelydefault': {'count': 347}, 'ConquerorsBlade': {'count': 238}, 'ChronicPain': {'count': 527}, 'teenagersnew': {'count': 256}, 'brasil': {'count': 1092}, 'MatthiasSubmissions': {'count': 921}, 'MarylandUnemployment': {'count': 314}, 'SaltLakeCity': {'count': 411}, 'BokunoheroFanfiction': {'count': 155}, 'BenignExistence': {'count': 125}, 'GayYoungOldDating': {'count': 156}, 'Bible': {'count': 202}, 'haskell': {'count': 154}, 'seduction': {'count': 400}, 'fantasywriters': {'count': 262}, 'HiveOS': {'count': 100}, 'PerkByDaylight': {'count': 15}, 'Hedgehog': {'count': 73}, 'xmen': {'count': 263}, 'HyperRP': {'count': 122}, 'emotestories': {'count': 3}, 'tutanota': {'count': 135}, 'CultoftheFranklin': {'count': 46}, 'langrisser': {'count': 62}, 'CozyGrove': {'count': 61}, 'Sverigesforsvarsmakt': {'count': 12}, 'silverbugbets': {'count': 21}, 'WreckingBallMains': {'count': 5}, 'capitalism_in_decay': {'count': 8}, 'paintdotnet': {'count': 11}, 'u_mawadom118': {'count': 4}, 'xboxfindfriends': {'count': 2}, 'CPTSD': {'count': 540}, 'destiny2': {'count': 318}, 'Wallstreetsilver': {'count': 1013}, 'DestinyTheGame': {'count': 1107}, 'blackopscoldwar': {'count': 400}, 'InstacartShoppers': {'count': 202}, 
'RocketLeagueExchange': {'count': 832}, 'apexlegends': {'count': 3265}, 'kansascity': {'count': 53}, 'namenerds': {'count': 235}, 'help': {'count': 152}, 'Kengan_Ashura': {'count': 132}, 'thetagang': {'count': 165}, 'GameSale': {'count': 262}, 'Reduction': {'count': 109}, 'sex': {'count': 906}, 'bostonr4r': {'count': 75}, 'LegendsOfRuneterra': {'count': 231}, 'overlord': {'count': 48}, 'madisonwi': {'count': 53}, 'steelseries': {'count': 79}, 'ClashOfClansRecruit': {'count': 214}, 'CharacterRant': {'count': 55}, 'AirForce': {'count': 94}, 'sexstories': {'count': 92}, 'NameThatSong': {'count': 162}, 'depressed': {'count': 74}, 'ibs': {'count': 150}, '40kLore': {'count': 269}, 'podcasts': {'count': 88}, 'miraculousladybug': {'count': 150}, 'ask': {'count': 224}, 'EverMerge': {'count': 31}, 'TMJ': {'count': 54}, 'BitLifeApp': {'count': 39}, 'FireEmblemHeroes': {'count': 100}, 'software': {'count': 62}, 'ShieldAndroidTV': {'count': 70}, 'GriefSupport': {'count': 125}, 'onewheel': {'count': 37}, 'MensRights': {'count': 80}, 'nhl': {'count': 22}, 'ClashOfClans': {'count': 107}, 'ps3homebrew': {'count': 33}, 'LightNovels': {'count': 77}, 'redsox': {'count': 34}, 'CryptoMarkets': {'count': 44}, 'ugly': {'count': 47}, 'GCXRep': {'count': 12}, 'cscareerquestionsEU': {'count': 65}, 'MindHunter': {'count': 6}, 'starcraft2coop': {'count': 15}, 'nanocurrency': {'count': 1421}, 'ModelCars': {'count': 8}, 'UKJobs': {'count': 30}, 'Netherlands': {'count': 44}, 'clonewars': {'count': 8}, 'Julia': {'count': 11}, 'Prolactinoma': {'count': 9}, 'sofi': {'count': 11}, 'royalfamily': {'count': 6}, 'ConnecticutR4R': {'count': 8}, 'weather': {'count': 5}, 'oneui': {'count': 7}, 'KTM': {'count': 5}, 'Aerials': {'count': 3}, 'seoul': {'count': 2}, 'exjw': {'count': 3281}, 'ModernMagic': {'count': 699}, 'Paladins': {'count': 1242}, 'kdramarecommends': {'count': 1611}, 'hitbtc': {'count': 330}, 'endocrinology': {'count': 75}, 'Bath': {'count': 43}, 'NassauCountyHookups': {'count': 5}, 
'feminineboys': {'count': 1248}, 'dreamsmp': {'count': 2018}, 'SquaredCircle': {'count': 2255}, 'Minecraft': {'count': 8753}, 'spirituality': {'count': 1809}, 'Eldenring': {'count': 1471}, 'Sat': {'count': 1172}, 'bonnaroo': {'count': 194}, 'gardening': {'count': 1892}, 'Unemployment': {'count': 6185}, 'mac': {'count': 1847}, 'Bestbuy': {'count': 437}, 'quittingkratom': {'count': 1081}, 'lawschooladmissions': {'count': 3436}, 'NiceHash': {'count': 2135}, 'McMaster': {'count': 815}, 'covidlonghaulers': {'count': 1299}, 'stalker': {'count': 758}, 'MLBTheShow': {'count': 2721}, 'FortniteCompetitive': {'count': 998}, 'dpdr': {'count': 514}, 'appliancerepair': {'count': 720}, 'thomasthetankengine': {'count': 207}, 'delhi': {'count': 217}, 'Huel': {'count': 300}, 'leafs': {'count': 203}, 'HotWheels': {'count': 170}, '90dayfianceuncensored': {'count': 550}, 'Throwers': {'count': 142}, 'Wavyhair': {'count': 270}, 'CryptoHorde': {'count': 128}, 'ShuumatsuNoValkyrie': {'count': 453}, 'TeensMeetTeens': {'count': 432}, 'dbrand': {'count': 108}, 'SLFmeetups': {'count': 18}, '1200isplentyketo': {'count': 48}, 'passive_income': {'count': 211}, 'BroadCity': {'count': 16}, 'RevenantMain': {'count': 71}, 'extrarfl': {'count': 25}, 'AgonGame': {'count': 5}, 'FitnessDE': {'count': 3}, 'gaming': {'count': 1277}, 'livesound': {'count': 91}, 'IBO': {'count': 1896}, 'EscapefromTarkov': {'count': 1300}, 'amex': {'count': 145}, 'DMAcademy': {'count': 1411}, 'VinylCollectors': {'count': 556}, 'cardano': {'count': 716}, 'brave_browser': {'count': 159}, 'dating': {'count': 952}, 'OculusQuest': {'count': 942}, 'Superstonk': {'count': 3089}, 'MtF': {'count': 957}, 'findaleague': {'count': 207}, 'Nioh': {'count': 398}, 'IRS': {'count': 715}, 'transgendercirclejerk': {'count': 353}, 'learnmath': {'count': 489}, 'piano': {'count': 263}, 'LeagueConnect': {'count': 216}, 'eu4': {'count': 561}, 'Wordpress': {'count': 345}, 'RoleplayingForReddit': {'count': 31}, 'LOONA': {'count': 89}, 'newtothenavy': 
{'count': 167}, 'HaircareScience': {'count': 118}, 'appletv': {'count': 167}, 'sissypersonals': {'count': 102}, 'raleigh': {'count': 168}, 'realonlyfansreviews': {'count': 21}, 'AskGames': {'count': 49}, 'PokemonTCG': {'count': 325}, 'controlgame': {'count': 109}, 'GoogleDataStudio': {'count': 16}, 'WhiteWolfRPG': {'count': 139}, 'MECoOp': {'count': 31}, 'snuffrp': {'count': 46}, 'lockpicking': {'count': 103}, 'wicked_edge': {'count': 105}, 'BMW': {'count': 99}, 'choiceofgames': {'count': 24}, 'hisdarkmaterials': {'count': 12}, 'SakuraGakuin': {'count': 24}, 'detrans': {'count': 55}, 'Smallville': {'count': 37}, 'kingofqueens': {'count': 7}, 'JamesHoffmann': {'count': 22}, 'stashinvest': {'count': 16}, 'ABA': {'count': 79}, 'ladybusiness': {'count': 10}, 'gamegrumps': {'count': 32}, 'GodEater': {'count': 21}, 'tomorrow': {'count': 39}, 'Tomorrowland': {'count': 9}, 'BlackCountryNewRoad': {'count': 5}, 'STAYC': {'count': 3}, 'SatoshiStreetBets': {'count': 3828}, 'AskLosAngeles': {'count': 1036}, 'buildapcforme': {'count': 1689}, 'ApplyingToCollege': {'count': 10675}, 'watercooling': {'count': 1209}, 'BreakUps': {'count': 4914}, 'FIFA': {'count': 3811}, 'emacs': {'count': 712}, 'trakstocks': {'count': 691}, 'Shittyaskflying': {'count': 147}, 'AmazonFC': {'count': 1178}, 'stocks': {'count': 4610}, 'BangaloreMains': {'count': 26}, 'pokemon': {'count': 3953}, 'religion': {'count': 684}, 'cuboulder': {'count': 269}, 'self': {'count': 1688}, 'tarot': {'count': 912}, 'turtles': {'count': 49}, 'TheMagnusArchives': {'count': 300}, 'Superhero_Ideas': {'count': 34}, 'NTU': {'count': 308}, 'touhou': {'count': 623}, 'JoJolion': {'count': 50}, 'lasers': {'count': 27}, 'popperpigs': {'count': 67}, 'aggretsuko': {'count': 20}, 'Library': {'count': 5}}}} | +| [RParisEasyI2IRetrieval](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Paris_and_CVPR_2018_paper.html) (Radenovi{\'c, 2018) | ['eng'] | Any2AnyRetrieval | i2i | [Web] | {'test': 6392} | {'test': 
{'number_of_characters': 0, 'num_samples': 6392, 'num_queries': 70, 'num_documents': 6322, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'min_document_image_width': 256, 'average_document_image_width': 256.0, 'max_document_image_width': 256, 'min_document_image_height': 256, 'average_document_image_height': 256.0, 'max_document_image_height': 256, 'num_document_images': 6322, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 70, 'min_query_image_width': 256, 'average_query_image_width': 256.0, 'max_query_image_width': 256, 'min_query_image_height': 256, 'average_query_image_height': 256.0, 'max_query_image_height': 256, 'min_relevant_docs_per_query': 2, 'average_relevant_docs_per_query': 98.2, 'max_relevant_docs_per_query': 199, 'unique_relevant_docs': 6322}} | +| [RParisHardI2IRetrieval](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Paris_and_CVPR_2018_paper.html) (Radenovi{\'c, 2018) | ['eng'] | Any2AnyRetrieval | i2i | [Web] | {'test': 6392} | {'test': {'number_of_characters': 0, 'num_samples': 6392, 'num_queries': 70, 'num_documents': 6322, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'min_document_image_width': 256, 'average_document_image_width': 256.0, 'max_document_image_width': 256, 'min_document_image_height': 256, 'average_document_image_height': 256.0, 'max_document_image_height': 256, 'num_document_images': 6322, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 70, 'min_query_image_width': 256, 'average_query_image_width': 256.0, 'max_query_image_width': 256, 'min_query_image_height': 256, 'average_query_image_height': 256.0, 'max_query_image_height': 256, 'min_relevant_docs_per_query': 34, 'average_relevant_docs_per_query': 147.86, 'max_relevant_docs_per_query': 556, 
'unique_relevant_docs': 6322}} | +| [RParisMediumI2IRetrieval](https://openaccess.thecvf.com/content_cvpr_2018/html/Radenovic_Revisiting_Paris_and_CVPR_2018_paper.html) (Radenovi{\'c, 2018) | ['eng'] | Any2AnyRetrieval | i2i | [Web] | {'test': 6392} | {'test': {'number_of_characters': 0, 'num_samples': 6392, 'num_queries': 70, 'num_documents': 6322, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'min_document_image_width': 256, 'average_document_image_width': 256.0, 'max_document_image_width': 256, 'min_document_image_height': 256, 'average_document_image_height': 256.0, 'max_document_image_height': 256, 'num_document_images': 6322, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 70, 'min_query_image_width': 256, 'average_query_image_width': 256.0, 'max_query_image_width': 256, 'min_query_image_height': 256, 'average_query_image_height': 256.0, 'max_query_image_height': 256, 'min_relevant_docs_per_query': 76, 'average_relevant_docs_per_query': 246.06, 'max_relevant_docs_per_query': 636, 'unique_relevant_docs': 6322}} | +| [RTE3](https://aclanthology.org/W07-1401/) (Giampiccolo et al., 2007) | ['deu', 'eng', 'fra', 'ita'] | PairClassification | s2s | [Encyclopaedic, News, Web, Written] | None | None | +| [RUParaPhraserSTS](https://aclanthology.org/2020.ngt-1.6) (Gudkov et al., 2020) | ['rus'] | STS | s2s | [News, Written] | None | None | +| [RavdessZeroshot](https://huggingface.co/datasets/narad/ravdess) (Livingstone et al., 2018) | ['eng'] | AudioZeroshotClassification | a2t | [Spoken] | None | None | +| [ReMuQIT2TRetrieval](https://github.com/luomancs/ReMuQ) (Luo et al., 2023) | ['eng'] | Any2AnyRetrieval | it2t | [Encyclopaedic] | {'test': 142403} | {'test': {'number_of_characters': 29161615, 'num_samples': 142403, 'num_queries': 3609, 'num_documents': 138794, 'min_document_length': 9, 'average_document_length': 208.19, 
'max_document_length': 508, 'unique_documents': 138794, 'num_document_images': 0, 'min_query_length': 18, 'average_query_length': 73.86, 'max_query_length': 218, 'unique_queries': 3608, 'num_query_images': 3609, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3607}} | +| [RedditClustering.v2](https://arxiv.org/abs/2104.07081) (Gregor Geigle and Nils Reimers and Andreas R{\"u, 2021) | ['eng'] | Clustering | s2s | [Social, Web, Written] | None | None | +| [RedditClusteringP2P.v2](https://arxiv.org/abs/2104.07081) (Gregor Geigle and Nils Reimers and Andreas R{\"u, 2021) | ['eng'] | Clustering | p2p | [Social, Web, Written] | {'test': 459389} | {'test': {'num_samples': 459389, 'number_of_characters': 334286895, 'min_text_length': 79, 'average_text_length': 727.68, 'max_text_length': 4359, 'min_labels_per_text': 2, 'average_labels_per_text': 1.0, 'max_labels_per_text': 77908, 'unique_labels': 440, 'labels': {'FortNiteBR': {'count': 436}, 'buildapc': {'count': 8484}, 'offmychest': {'count': 570}, 'nus': {'count': 45}, 'relationship_advice': {'count': 16651}, 'premed': {'count': 201}, 'dogecoin': {'count': 8108}, 'GamingLaptops': {'count': 183}, 'asktransgender': {'count': 326}, 'MachineLearning': {'count': 61}, 'puppy101': {'count': 1597}, 'GunAccessoriesForSale': {'count': 2619}, 'Random_Acts_Of_Amazon': {'count': 1115}, 'Catholicism': {'count': 183}, 'MonsterHunter': {'count': 218}, 'tipofmypenis': {'count': 87}, 'samsung': {'count': 69}, 'PersonalFinanceCanada': {'count': 341}, 'Dyson_Sphere_Program': {'count': 55}, 'bleach': {'count': 41}, 'AmItheAsshole': {'count': 3730}, 'WallStreetbetsELITE': {'count': 328}, 'GlobalPowers': {'count': 35}, 'ABraThatFits': {'count': 159}, 'PokemonGoFriends': {'count': 1165}, 'NoMansSkyTheGame': {'count': 259}, 'masseffect': {'count': 233}, 'dating_advice': {'count': 559}, 'yoga': {'count': 50}, 'depression': {'count': 515}, 'COVID19positive': 
{'count': 180}, 'generationology': {'count': 37}, 'feedthebeast': {'count': 192}, 'EliteDangerous': {'count': 270}, 'alcoholicsanonymous': {'count': 93}, 'GoRVing': {'count': 35}, 'thedivision': {'count': 111}, 'breakingmom': {'count': 105}, 'AskAnAmerican': {'count': 80}, 'HypnoFair': {'count': 5}, 'JustUnsubbed': {'count': 13}, 'socialanxiety': {'count': 123}, 'dirtykikpals': {'count': 202}, 'askTO': {'count': 126}, 'AskCulinary': {'count': 108}, 'Bogleheads': {'count': 71}, 'dragonquest': {'count': 45}, 'NoContract': {'count': 30}, 'gorillaz': {'count': 14}, 'MondoGore': {'count': 8}, 'comicswap': {'count': 56}, 'VirtualYoutubers': {'count': 92}, 'Gta5Modding': {'count': 28}, 'obs': {'count': 61}, 'vcu': {'count': 9}, 'KingkillerChronicle': {'count': 17}, 'AmongUs': {'count': 41}, 'wireshark': {'count': 3}, 'Dodocodes': {'count': 46}, 'Aliexpress': {'count': 40}, 'LearnerDriverUK': {'count': 12}, 'PanicAttack': {'count': 23}, 'KassadinMains': {'count': 10}, 'islam': {'count': 93}, 'chronotrigger': {'count': 4}, 'skincareexchange': {'count': 13}, 'PokemonHome': {'count': 21}, 'survivinginfidelity': {'count': 71}, 'igcse': {'count': 21}, 'C25K': {'count': 21}, 'aorus': {'count': 2}, 'idleon': {'count': 19}, 'photography': {'count': 22}, 'cryptocoins': {'count': 7}, 'CanaryWharfBets': {'count': 7}, 'KillingEve': {'count': 7}, 'GameBuilderGarage': {'count': 16}, 'SauceSharingCommunity': {'count': 7}, 'turo': {'count': 9}, 'foodscience': {'count': 14}, 'HIMYM': {'count': 20}, 'HauntingOfHillHouse': {'count': 4}, 'GoodNotes': {'count': 8}, 'RedditWritesSeinfeld': {'count': 6}, 'AirReps': {'count': 2}, 'ADHD': {'count': 3811}, 'BuddyCrossing': {'count': 446}, 'libraryofruina': {'count': 98}, 'SluttyConfessions': {'count': 2787}, 'tipofmytongue': {'count': 7145}, 'fleshlight': {'count': 128}, 'amcstock': {'count': 13910}, 'teenagers': {'count': 77908}, 'suggestmeabook': {'count': 1540}, 'dirtypenpals': {'count': 5587}, 'MinecraftServer': {'count': 177}, 'CreditCards': 
{'count': 669}, 'Guitar': {'count': 10952}, 'rpg': {'count': 529}, 'NoFap': {'count': 14853}, 'lfg': {'count': 1093}, 'MarsWallStreet': {'count': 935}, 'SummonSign': {'count': 931}, 'AssassinsCreedValhala': {'count': 295}, 'hoi4': {'count': 432}, 'Coins4Sale': {'count': 260}, 'xbox': {'count': 459}, 'TooAfraidToAsk': {'count': 7404}, 'NBA2k': {'count': 553}, 'KGBTR': {'count': 943}, 'roblox': {'count': 220}, 'salesforce': {'count': 214}, 'TwoXChromosomes': {'count': 1736}, 'mechmarket': {'count': 4863}, 'Gaming_Headsets': {'count': 103}, 'pittsburgh': {'count': 189}, 'CryptoMars': {'count': 1606}, 'FridayNightFunkin': {'count': 378}, 'vaginismus': {'count': 122}, 'transpositive': {'count': 10}, 'comicbooks': {'count': 274}, 'BDSMcommunity': {'count': 185}, 'aliens': {'count': 201}, 'Scotch': {'count': 64}, 'KikRoleplay': {'count': 141}, 'Kayaking': {'count': 91}, '196': {'count': 47}, 'digimon': {'count': 140}, 'Evernote': {'count': 42}, 'logh': {'count': 22}, 'arlington': {'count': 15}, 'Adopted': {'count': 8}, 'DissonautUniverse': {'count': 4}, 'Midsommar': {'count': 12}, 'SofiawithanF': {'count': 83}, 'xmpp': {'count': 6}, 'ZombsRoyale': {'count': 16}, 'accesscontrol': {'count': 8}, 'WetlanderHumor': {'count': 2}, 'PoonamPandeyFanatics': {'count': 2}, 'screenplaychallenge': {'count': 2}, 'scatstories': {'count': 2}, 'techsupport': {'count': 290}, 'whatcarshouldIbuy': {'count': 79}, 'Stormlight_Archive': {'count': 15}, 'deadbydaylight': {'count': 126}, 'bicycling': {'count': 27}, 'oculus': {'count': 64}, 'Cartalk': {'count': 33}, 'Sims4': {'count': 43}, 'NoFeeAC': {'count': 95}, 'Crypto_com': {'count': 37}, 'ITCareerQuestions': {'count': 259}, 'aromantic': {'count': 18}, 'Revu': {'count': 3}, 'exalted': {'count': 2}, 'HilariaBaldwin': {'count': 20}, 'Testosterone': {'count': 35}, 'Screenwriting': {'count': 170}, 'LifeProTips': {'count': 49}, 'steinsgate': {'count': 13}, 'Baystreetbets': {'count': 10}, 'AskGirls': {'count': 7}, 'idlechampions': {'count': 7}, 
'facebook': {'count': 17}, 'tf2trade': {'count': 4}, 'mfdoom': {'count': 3}, 'FiddlesticksMains': {'count': 2}, 'HFY': {'count': 10}, 'FiestaST': {'count': 2}, 'whatsthatbook': {'count': 994}, 'GearsOfWar': {'count': 879}, 'KazuhaMains': {'count': 175}, 'RepTime': {'count': 211}, 'AstroGaming': {'count': 141}, 'metalgearsolid': {'count': 152}, 'qBittorrent': {'count': 39}, 'ELLIPAL_Official': {'count': 24}, 'raisedbynarcissists': {'count': 4895}, 'unpopularopinion': {'count': 14901}, 'ACTrade': {'count': 5679}, 'askcarsales': {'count': 1339}, 'AskVet': {'count': 1357}, 'whowouldwin': {'count': 4493}, 'playstation': {'count': 1362}, 'anime': {'count': 6531}, 'GME': {'count': 12577}, 'DotA2': {'count': 2004}, 'cryptostreetbets': {'count': 2241}, 'MonsterHunterWorld': {'count': 698}, 'Market76': {'count': 14274}, 'DnD': {'count': 5092}, 'leagueoflegends': {'count': 3683}, 'doordash_drivers': {'count': 1626}, 'theta_network': {'count': 489}, 'exmuslim': {'count': 1369}, 'gonewildaudio': {'count': 2998}, 'conspiracy': {'count': 3587}, 'heroesofthestorm': {'count': 535}, 'FanFiction': {'count': 2782}, 'Doom': {'count': 1251}, 'texas': {'count': 269}, 'Vent': {'count': 1738}, 'selfimprovement': {'count': 1284}, 'youtubers': {'count': 706}, 'askseddit': {'count': 237}, 'boardgames': {'count': 1237}, 'bravelydefault': {'count': 347}, 'ConquerorsBlade': {'count': 238}, 'ChronicPain': {'count': 527}, 'teenagersnew': {'count': 256}, 'brasil': {'count': 1092}, 'MatthiasSubmissions': {'count': 921}, 'MarylandUnemployment': {'count': 314}, 'SaltLakeCity': {'count': 411}, 'BokunoheroFanfiction': {'count': 155}, 'BenignExistence': {'count': 125}, 'GayYoungOldDating': {'count': 156}, 'Bible': {'count': 202}, 'haskell': {'count': 154}, 'seduction': {'count': 400}, 'fantasywriters': {'count': 262}, 'HiveOS': {'count': 100}, 'PerkByDaylight': {'count': 15}, 'Hedgehog': {'count': 73}, 'xmen': {'count': 263}, 'HyperRP': {'count': 122}, 'emotestories': {'count': 3}, 'tutanota': {'count': 
135}, 'CultoftheFranklin': {'count': 46}, 'langrisser': {'count': 62}, 'CozyGrove': {'count': 61}, 'Sverigesforsvarsmakt': {'count': 12}, 'silverbugbets': {'count': 21}, 'WreckingBallMains': {'count': 5}, 'capitalism_in_decay': {'count': 8}, 'paintdotnet': {'count': 11}, 'u_mawadom118': {'count': 4}, 'xboxfindfriends': {'count': 2}, 'CPTSD': {'count': 540}, 'destiny2': {'count': 318}, 'Wallstreetsilver': {'count': 1013}, 'DestinyTheGame': {'count': 1107}, 'blackopscoldwar': {'count': 400}, 'InstacartShoppers': {'count': 202}, 'RocketLeagueExchange': {'count': 832}, 'apexlegends': {'count': 3265}, 'kansascity': {'count': 53}, 'namenerds': {'count': 235}, 'help': {'count': 152}, 'Kengan_Ashura': {'count': 132}, 'thetagang': {'count': 165}, 'GameSale': {'count': 262}, 'Reduction': {'count': 109}, 'sex': {'count': 906}, 'bostonr4r': {'count': 75}, 'LegendsOfRuneterra': {'count': 231}, 'overlord': {'count': 48}, 'madisonwi': {'count': 53}, 'steelseries': {'count': 79}, 'ClashOfClansRecruit': {'count': 214}, 'CharacterRant': {'count': 55}, 'AirForce': {'count': 94}, 'sexstories': {'count': 92}, 'NameThatSong': {'count': 162}, 'depressed': {'count': 74}, 'ibs': {'count': 150}, '40kLore': {'count': 269}, 'podcasts': {'count': 88}, 'miraculousladybug': {'count': 150}, 'ask': {'count': 224}, 'EverMerge': {'count': 31}, 'TMJ': {'count': 54}, 'BitLifeApp': {'count': 39}, 'FireEmblemHeroes': {'count': 100}, 'software': {'count': 62}, 'ShieldAndroidTV': {'count': 70}, 'GriefSupport': {'count': 125}, 'onewheel': {'count': 37}, 'MensRights': {'count': 80}, 'nhl': {'count': 22}, 'ClashOfClans': {'count': 107}, 'ps3homebrew': {'count': 33}, 'LightNovels': {'count': 77}, 'redsox': {'count': 34}, 'CryptoMarkets': {'count': 44}, 'ugly': {'count': 47}, 'GCXRep': {'count': 12}, 'cscareerquestionsEU': {'count': 65}, 'MindHunter': {'count': 6}, 'starcraft2coop': {'count': 15}, 'nanocurrency': {'count': 1421}, 'ModelCars': {'count': 8}, 'UKJobs': {'count': 30}, 'Netherlands': {'count': 44}, 
'clonewars': {'count': 8}, 'Julia': {'count': 11}, 'Prolactinoma': {'count': 9}, 'sofi': {'count': 11}, 'royalfamily': {'count': 6}, 'ConnecticutR4R': {'count': 8}, 'weather': {'count': 5}, 'oneui': {'count': 7}, 'KTM': {'count': 5}, 'Aerials': {'count': 3}, 'seoul': {'count': 2}, 'exjw': {'count': 3281}, 'ModernMagic': {'count': 699}, 'Paladins': {'count': 1242}, 'kdramarecommends': {'count': 1611}, 'hitbtc': {'count': 330}, 'endocrinology': {'count': 75}, 'Bath': {'count': 43}, 'NassauCountyHookups': {'count': 5}, 'feminineboys': {'count': 1248}, 'dreamsmp': {'count': 2018}, 'SquaredCircle': {'count': 2255}, 'Minecraft': {'count': 8753}, 'spirituality': {'count': 1809}, 'Eldenring': {'count': 1471}, 'Sat': {'count': 1172}, 'bonnaroo': {'count': 194}, 'gardening': {'count': 1892}, 'Unemployment': {'count': 6185}, 'mac': {'count': 1847}, 'Bestbuy': {'count': 437}, 'quittingkratom': {'count': 1081}, 'lawschooladmissions': {'count': 3436}, 'NiceHash': {'count': 2135}, 'McMaster': {'count': 815}, 'covidlonghaulers': {'count': 1299}, 'stalker': {'count': 758}, 'MLBTheShow': {'count': 2721}, 'FortniteCompetitive': {'count': 998}, 'dpdr': {'count': 514}, 'appliancerepair': {'count': 720}, 'thomasthetankengine': {'count': 207}, 'delhi': {'count': 217}, 'Huel': {'count': 300}, 'leafs': {'count': 203}, 'HotWheels': {'count': 170}, '90dayfianceuncensored': {'count': 550}, 'Throwers': {'count': 142}, 'Wavyhair': {'count': 270}, 'CryptoHorde': {'count': 128}, 'ShuumatsuNoValkyrie': {'count': 453}, 'TeensMeetTeens': {'count': 432}, 'dbrand': {'count': 108}, 'SLFmeetups': {'count': 18}, '1200isplentyketo': {'count': 48}, 'passive_income': {'count': 211}, 'BroadCity': {'count': 16}, 'RevenantMain': {'count': 71}, 'extrarfl': {'count': 25}, 'AgonGame': {'count': 5}, 'FitnessDE': {'count': 3}, 'gaming': {'count': 1277}, 'livesound': {'count': 91}, 'IBO': {'count': 1896}, 'EscapefromTarkov': {'count': 1300}, 'amex': {'count': 145}, 'DMAcademy': {'count': 1411}, 'VinylCollectors': 
{'count': 556}, 'cardano': {'count': 716}, 'brave_browser': {'count': 159}, 'dating': {'count': 952}, 'OculusQuest': {'count': 942}, 'Superstonk': {'count': 3089}, 'MtF': {'count': 957}, 'findaleague': {'count': 207}, 'Nioh': {'count': 398}, 'IRS': {'count': 715}, 'transgendercirclejerk': {'count': 353}, 'learnmath': {'count': 489}, 'piano': {'count': 263}, 'LeagueConnect': {'count': 216}, 'eu4': {'count': 561}, 'Wordpress': {'count': 345}, 'RoleplayingForReddit': {'count': 31}, 'LOONA': {'count': 89}, 'newtothenavy': {'count': 167}, 'HaircareScience': {'count': 118}, 'appletv': {'count': 167}, 'sissypersonals': {'count': 102}, 'raleigh': {'count': 168}, 'realonlyfansreviews': {'count': 21}, 'AskGames': {'count': 49}, 'PokemonTCG': {'count': 325}, 'controlgame': {'count': 109}, 'GoogleDataStudio': {'count': 16}, 'WhiteWolfRPG': {'count': 139}, 'MECoOp': {'count': 31}, 'snuffrp': {'count': 46}, 'lockpicking': {'count': 103}, 'wicked_edge': {'count': 105}, 'BMW': {'count': 99}, 'choiceofgames': {'count': 24}, 'hisdarkmaterials': {'count': 12}, 'SakuraGakuin': {'count': 24}, 'detrans': {'count': 55}, 'Smallville': {'count': 37}, 'kingofqueens': {'count': 7}, 'JamesHoffmann': {'count': 22}, 'stashinvest': {'count': 16}, 'ABA': {'count': 79}, 'ladybusiness': {'count': 10}, 'gamegrumps': {'count': 32}, 'GodEater': {'count': 21}, 'tomorrow': {'count': 39}, 'Tomorrowland': {'count': 9}, 'BlackCountryNewRoad': {'count': 5}, 'STAYC': {'count': 3}, 'SatoshiStreetBets': {'count': 3828}, 'AskLosAngeles': {'count': 1036}, 'buildapcforme': {'count': 1689}, 'ApplyingToCollege': {'count': 10675}, 'watercooling': {'count': 1209}, 'BreakUps': {'count': 4914}, 'FIFA': {'count': 3811}, 'emacs': {'count': 712}, 'trakstocks': {'count': 691}, 'Shittyaskflying': {'count': 147}, 'AmazonFC': {'count': 1178}, 'stocks': {'count': 4610}, 'BangaloreMains': {'count': 26}, 'pokemon': {'count': 3953}, 'religion': {'count': 684}, 'cuboulder': {'count': 269}, 'self': {'count': 1688}, 'tarot': 
{'count': 912}, 'turtles': {'count': 49}, 'TheMagnusArchives': {'count': 300}, 'Superhero_Ideas': {'count': 34}, 'NTU': {'count': 308}, 'touhou': {'count': 623}, 'JoJolion': {'count': 50}, 'lasers': {'count': 27}, 'popperpigs': {'count': 67}, 'aggretsuko': {'count': 20}, 'Library': {'count': 5}}}} | | [RenderedSST2](https://huggingface.co/datasets/clip-benchmark/wds_renderedsst2) | ['eng'] | ZeroShotClassification | i2t | [Reviews] | {'test': 1821} | {'test': {'num_samples': 1821, 'unique_num_labels': 2, 'min_image_width': 448, 'average_image_width': 448.0, 'max_image_width': 448, 'min_image_height': 448, 'average_image_height': 448.0, 'max_image_height': 448, 'min_label_text_length': 28, 'average_label_text_length': 28.0, 'max_label_text_length': 28, 'labels': {'0': {'count': 912}, '1': {'count': 909}}}} | | [RestaurantReviewSentimentClassification](https://link.springer.com/chapter/10.1007/978-3-319-18117-2_2) (ElSahar et al., 2015) | ['ara'] | Classification | s2s | [Reviews, Written] | None | None | | [RiaNewsRetrieval](https://arxiv.org/abs/1901.07786) (Gavrilov et al., 2019) | ['rus'] | Retrieval | s2p | [News, Written] | None | None | @@ -621,23 +641,26 @@ The following tables give you an overview of the tasks in MTEB. 
| [RonSTS](https://openreview.net/forum?id=JH61CD7afTv) (Dumitrescu et al., 2021) | ['ron'] | STS | s2s | [News, Social, Web, Written] | None | None | | [RuBQReranking](https://openreview.net/pdf?id=P5UQFFoQ4PJ) (Ivan Rybin, 2021) | ['rus'] | Reranking | s2p | [Encyclopaedic, Written] | None | None | | [RuBQRetrieval](https://openreview.net/pdf?id=P5UQFFoQ4PJ) (Ivan Rybin, 2021) | ['rus'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | +| [RuNLUIntentClassification](https://arxiv.org/abs/1903.05566) (Xingkun Liu, 2019) | ['rus'] | Classification | t2t | | None | None | | [RuReviewsClassification](https://github.com/sismetanin/rureviews) (Sergey Smetanin, 2019) | ['rus'] | Classification | p2p | [Reviews, Written] | None | None | | [RuSTSBenchmarkSTS](https://github.com/PhilipMay/stsb-multi-mt/) (Philip May, 2021) | ['rus'] | STS | s2s | [News, Social, Web, Written] | None | None | | [RuSciBenchGRNTIClassification](https://github.com/mlsa-iai-msu-lab/ru_sci_bench/) | ['rus'] | Classification | p2p | [Academic, Written] | None | None | | [RuSciBenchGRNTIClusteringP2P](https://github.com/mlsa-iai-msu-lab/ru_sci_bench/) | ['rus'] | Clustering | p2p | [Academic, Written] | {'test': 2048} | {'test': {'num_samples': 2048, 'number_of_characters': 1822339, 'min_text_length': 84, 'average_text_length': 889.81, 'max_text_length': 3143, 'min_labels_per_text': 73, 'average_labels_per_text': 1.0, 'max_labels_per_text': 74, 'unique_labels': 28, 'labels': {'3': {'count': 73}, '4': {'count': 73}, '20': {'count': 73}, '9': {'count': 73}, '21': {'count': 73}, '15': {'count': 73}, '16': {'count': 74}, '2': {'count': 73}, '8': {'count': 73}, '23': {'count': 73}, '6': {'count': 73}, '24': {'count': 73}, '10': {'count': 73}, '1': {'count': 73}, '17': {'count': 74}, '14': {'count': 74}, '18': {'count': 73}, '27': {'count': 73}, '19': {'count': 73}, '22': {'count': 73}, '12': {'count': 73}, '25': {'count': 73}, '5': {'count': 74}, '0': {'count': 73}, '26': {'count': 73}, '11': 
{'count': 73}, '13': {'count': 73}, '7': {'count': 73}}}} | | [RuSciBenchOECDClassification](https://github.com/mlsa-iai-msu-lab/ru_sci_bench/) | ['rus'] | Classification | p2p | [Academic, Written] | None | None | | [RuSciBenchOECDClusteringP2P](https://github.com/mlsa-iai-msu-lab/ru_sci_bench/) | ['rus'] | Clustering | p2p | [Academic, Written] | None | None | +| [RuToxicOKMLCUPClassification](https://cups.online/ru/contests/okmlcup2020) | ['rus'] | Classification | t2t | | None | None | +| [RuToxicOKMLCUPMultilabelClassification](https://cups.online/ru/contests/okmlcup2020) | ['rus'] | Classification | t2t | | None | None | | [SAMSumFa](https://huggingface.co/datasets/MCINext/samsum-fa) | ['fas'] | BitextMining | s2p | [Spoken] | None | None | -| [SCDBPAccountabilityLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | -| [SCDBPAuditsLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | -| [SCDBPCertificationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | -| [SCDBPTrainingLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | -| [SCDBPVerificationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | -| [SCDDAccountabilityLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | -| [SCDDAuditsLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, 
Written] | None | None | -| [SCDDCertificationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | -| [SCDDTrainingLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | -| [SCDDVerificationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [SCDBPAccountabilityLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Chilton et al., 2017) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [SCDBPAuditsLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Chilton et al., 2017) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [SCDBPCertificationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Chilton et al., 2017) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [SCDBPTrainingLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Chilton et al., 2017) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [SCDBPVerificationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Chilton et al., 2017) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [SCDDAccountabilityLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Chilton et al., 2017) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [SCDDAuditsLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Chilton et al., 2017) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [SCDDCertificationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Chilton et al., 2017) | 
['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [SCDDTrainingLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Chilton et al., 2017) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [SCDDVerificationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Chilton et al., 2017) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [SCIDOCS](https://allenai.org/data/scidocs) (Arman Cohan, 2020) | ['eng'] | Retrieval | s2p | [Academic, Non-fiction, Written] | None | None | | [SCIDOCS-Fa](https://huggingface.co/datasets/MCINext/scidocs-fa) | ['fas'] | Retrieval | s2p | [Academic] | None | None | | [SCIDOCS-NL](https://huggingface.co/datasets/clips/beir-nl-scidocs) (Nikolay Banar, 2024) | ['nld'] | Retrieval | s2p | [Academic, Non-fiction, Written] | None | None | @@ -646,15 +669,16 @@ The following tables give you an overview of the tasks in MTEB. | [SDSGlovesClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2p | [Chemistry] | None | None | | [SIB200Classification](https://arxiv.org/abs/2309.07445) (Adelani et al., 2023) | ['ace', 'acm', 'acq', 'aeb', 'afr', 'ajp', 'aka', 'als', 'amh', 'apc', 'arb', 'ars', 'ary', 'arz', 'asm', 'ast', 'awa', 'ayr', 'azb', 'azj', 'bak', 'bam', 'ban', 'bel', 'bem', 'ben', 'bho', 'bjn', 'bod', 'bos', 'bug', 'bul', 'cat', 'ceb', 'ces', 'cjk', 'ckb', 'crh', 'cym', 'dan', 'deu', 'dik', 'dyu', 'dzo', 'ell', 'eng', 'epo', 'est', 'eus', 'ewe', 'fao', 'fij', 'fin', 'fon', 'fra', 'fur', 'fuv', 'gaz', 'gla', 'gle', 'glg', 'grn', 'guj', 'hat', 'hau', 'heb', 'hin', 'hne', 'hrv', 'hun', 'hye', 'ibo', 'ilo', 'ind', 'isl', 'ita', 'jav', 'jpn', 'kab', 'kac', 'kam', 'kan', 'kas', 'kat', 'kaz', 'kbp', 'kea', 'khk', 'khm', 'kik', 'kin', 'kir', 'kmb', 'kmr', 'knc', 'kon', 'kor', 'lao', 'lij', 'lim', 'lin', 'lit', 'lmo', 'ltg', 'ltz', 'lua', 'lug', 'luo', 'lus', 'lvs', 'mag', 'mai', 'mal', 'mar', 'min', 
'mkd', 'mlt', 'mni', 'mos', 'mri', 'mya', 'nld', 'nno', 'nob', 'npi', 'nqo', 'nso', 'nus', 'nya', 'oci', 'ory', 'pag', 'pan', 'pap', 'pbt', 'pes', 'plt', 'pol', 'por', 'prs', 'quy', 'ron', 'run', 'rus', 'sag', 'san', 'sat', 'scn', 'shn', 'sin', 'slk', 'slv', 'smo', 'sna', 'snd', 'som', 'sot', 'spa', 'srd', 'srp', 'ssw', 'sun', 'swe', 'swh', 'szl', 'tam', 'taq', 'tat', 'tel', 'tgk', 'tgl', 'tha', 'tir', 'tpi', 'tsn', 'tso', 'tuk', 'tum', 'tur', 'twi', 'tzm', 'uig', 'ukr', 'umb', 'urd', 'uzn', 'vec', 'vie', 'war', 'wol', 'xho', 'ydd', 'yor', 'yue', 'zho', 'zsm', 'zul'] | Classification | s2s | [News, Written] | None | None | | [SIB200ClusteringS2S](https://arxiv.org/abs/2309.07445) (Adelani et al., 2023) | ['ace', 'acm', 'acq', 'aeb', 'afr', 'ajp', 'aka', 'als', 'amh', 'apc', 'arb', 'ars', 'ary', 'arz', 'asm', 'ast', 'awa', 'ayr', 'azb', 'azj', 'bak', 'bam', 'ban', 'bel', 'bem', 'ben', 'bho', 'bjn', 'bod', 'bos', 'bug', 'bul', 'cat', 'ceb', 'ces', 'cjk', 'ckb', 'crh', 'cym', 'dan', 'deu', 'dik', 'dyu', 'dzo', 'ell', 'eng', 'epo', 'est', 'eus', 'ewe', 'fao', 'fij', 'fin', 'fon', 'fra', 'fur', 'fuv', 'gaz', 'gla', 'gle', 'glg', 'grn', 'guj', 'hat', 'hau', 'heb', 'hin', 'hne', 'hrv', 'hun', 'hye', 'ibo', 'ilo', 'ind', 'isl', 'ita', 'jav', 'jpn', 'kab', 'kac', 'kam', 'kan', 'kas', 'kat', 'kaz', 'kbp', 'kea', 'khk', 'khm', 'kik', 'kin', 'kir', 'kmb', 'kmr', 'knc', 'kon', 'kor', 'lao', 'lij', 'lim', 'lin', 'lit', 'lmo', 'ltg', 'ltz', 'lua', 'lug', 'luo', 'lus', 'lvs', 'mag', 'mai', 'mal', 'mar', 'min', 'mkd', 'mlt', 'mni', 'mos', 'mri', 'mya', 'nld', 'nno', 'nob', 'npi', 'nqo', 'nso', 'nus', 'nya', 'oci', 'ory', 'pag', 'pan', 'pap', 'pbt', 'pes', 'plt', 'pol', 'por', 'prs', 'quy', 'ron', 'run', 'rus', 'sag', 'san', 'sat', 'scn', 'shn', 'sin', 'slk', 'slv', 'smo', 'sna', 'snd', 'som', 'sot', 'spa', 'srd', 'srp', 'ssw', 'sun', 'swe', 'swh', 'szl', 'tam', 'taq', 'tat', 'tel', 'tgk', 'tgl', 'tha', 'tir', 'tpi', 'tsn', 'tso', 'tuk', 'tum', 'tur', 'twi', 'tzm', 'uig', 'ukr', 
'umb', 'urd', 'uzn', 'vec', 'vie', 'war', 'wol', 'xho', 'ydd', 'yor', 'yue', 'zho', 'zsm', 'zul'] | Clustering | s2s | [News, Written] | None | None | -| [SICK-BR-PC](https://linux.ime.usp.br/~thalen/SICK_PT.pdf) | ['por'] | PairClassification | s2s | [Web, Written] | None | None | -| [SICK-BR-STS](https://linux.ime.usp.br/~thalen/SICK_PT.pdf) | ['por'] | STS | s2s | [Web, Written] | None | None | -| [SICK-E-PL](https://aclanthology.org/2020.lrec-1.207) | ['pol'] | PairClassification | s2s | | None | None | -| [SICK-R](https://aclanthology.org/L14-1314/) | ['eng'] | STS | s2s | [Web, Written] | None | None | -| [SICK-R-PL](https://aclanthology.org/2020.lrec-1.207) | ['pol'] | STS | s2s | [Web, Written] | None | None | +| [SIBFLEURS](https://huggingface.co/datasets/WueNLP/sib-fleurs) (Fabian David Schmidt, 2025) | ['eng'] | AudioMultilabelClassification | a2t | [Encyclopaedic] | None | None | +| [SICK-BR-PC](https://linux.ime.usp.br/~thalen/SICK_PT.pdf) (Real et al., 2018) | ['por'] | PairClassification | s2s | [Web, Written] | None | None | +| [SICK-BR-STS](https://linux.ime.usp.br/~thalen/SICK_PT.pdf) (Real et al., 2018) | ['por'] | STS | s2s | [Web, Written] | None | None | +| [SICK-E-PL](https://aclanthology.org/2020.lrec-1.207) (Dadas et al., 2020) | ['pol'] | PairClassification | s2s | [Reviews] | None | None | +| [SICK-R](https://aclanthology.org/L14-1314/) (Marelli et al., 2014) | ['eng'] | STS | s2s | [Web, Written] | None | None | +| [SICK-R-PL](https://aclanthology.org/2020.lrec-1.207) (Dadas et al., 2020) | ['pol'] | STS | s2s | [Web, Written] | None | None | | [SICKFr](https://huggingface.co/datasets/Lajavaness/SICK-fr) | ['fra'] | STS | s2s | | None | None | | [SIDClassification](https://mcinext.com/) | ['fas'] | Classification | p2p | [Academic] | None | None | | [SIDClustring](https://www.sid.com/) | ['fas'] | Clustering | p2p | [Academic] | None | None | -| [SIQA](https://leaderboard.allenai.org/socialiqa/submissions/get-started) (Xiao et al., 2024) 
| ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | +| [SIQA](https://leaderboard.allenai.org/socialiqa/submissions/get-started) (Sap et al., 2019) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | | [SKQuadRetrieval](https://huggingface.co/datasets/TUKE-KEMT/retrieval-skquad) | ['slk'] | Retrieval | s2s | [Encyclopaedic] | None | None | | [SNLHierarchicalClusteringP2P](https://huggingface.co/datasets/navjordj/SNL_summarization) (Navjord et al., 2023) | ['nob'] | Clustering | p2p | [Encyclopaedic, Non-fiction, Written] | None | None | | [SNLHierarchicalClusteringS2S](https://huggingface.co/datasets/navjordj/SNL_summarization) (Navjord et al., 2023) | ['nob'] | Clustering | s2s | [Encyclopaedic, Non-fiction, Written] | None | None | @@ -667,26 +691,27 @@ The following tables give you an overview of the tasks in MTEB. | [STS12VisualSTS](https://arxiv.org/abs/2402.08183/) (Xiao et al., 2024) | ['eng'] | VisualSTS(eng) | i2i | [Encyclopaedic, News, Written] | {'test': 3108} | {'test': {'num_samples': 3108, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 3.51, 'max_score': 5.0}} | | [STS13](https://www.aclweb.org/anthology/S13-1004/) (Eneko Agirre, 2013) | ['eng'] | STS | s2s | [News, Non-fiction, Web, Written] | None | None | | [STS13VisualSTS](https://arxiv.org/abs/2402.08183/) (Xiao et al., 2024) | ['eng'] | VisualSTS(eng) | i2i | [News, Non-fiction, Web, Written] | {'test': 1500} | {'test': {'num_samples': 1500, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 
'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.34, 'max_score': 5.0}} | -| [STS14](https://www.aclweb.org/anthology/S14-1002) | ['eng'] | STS | s2s | [Blog, Spoken, Web] | None | None | +| [STS14](https://www.aclweb.org/anthology/S14-1002) (Bandhakavi et al., 2014) | ['eng'] | STS | s2s | [Blog, Spoken, Web] | None | None | | [STS14VisualSTS](https://arxiv.org/abs/2402.08183/) (Xiao et al., 2024) | ['eng'] | VisualSTS(eng) | i2i | [Blog, Spoken, Web] | {'test': 3750} | {'test': {'num_samples': 3750, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.81, 'max_score': 5.0}} | -| [STS15](https://www.aclweb.org/anthology/S15-2010) | ['eng'] | STS | s2s | [Blog, News, Spoken, Web, Written] | None | None | +| [STS15](https://www.aclweb.org/anthology/S15-2010) (Bi{\c{c, 2015) | ['eng'] | STS | s2s | [Blog, News, Spoken, Web, Written] | None | None | | [STS15VisualSTS](https://arxiv.org/abs/2402.08183/) (Xiao et al., 2024) | ['eng'] | VisualSTS(eng) | i2i | [Blog, News, Spoken, Web, Written] | {'test': 3000} | {'test': {'num_samples': 3000, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.41, 'max_score': 5.0}} | -| [STS16](https://www.aclweb.org/anthology/S16-1001) | ['eng'] | STS | s2s | [Blog, Spoken, Web] | None | None | +| 
[STS16](https://www.aclweb.org/anthology/S16-1001) (Nakov et al., 2016) | ['eng'] | STS | s2s | [Blog, Spoken, Web] | None | None | | [STS16VisualSTS](https://arxiv.org/abs/2402.08183/) (Xiao et al., 2024) | ['eng'] | VisualSTS(eng) | i2i | [Blog, Spoken, Web] | {'test': 1186} | {'test': {'num_samples': 1186, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.41, 'max_score': 5.0}} | -| [STS17](https://alt.qcri.org/semeval2017/task1/) | ['ara', 'deu', 'eng', 'fra', 'ita', 'kor', 'nld', 'spa', 'tur'] | STS | s2s | [News, Web, Written] | {'test': 5346} | {'test': {'num_samples': 5346, 'number_of_characters': 400264, 'min_sentence1_length': 6, 'average_sentence1_len': 38.15, 'max_sentence1_length': 976, 'unique_sentence1': 4900, 'min_sentence2_length': 6, 'average_sentence2_len': 36.73, 'max_sentence2_length': 1007, 'unique_sentence2': 4470, 'min_score': 0.0, 'avg_score': 2.36, 'max_score': 5.0, 'hf_subset_descriptive_stats': {'ko-ko': {'num_samples': 2846, 'number_of_characters': 183387, 'min_sentence1_length': 6, 'average_sentence1_len': 31.99, 'max_sentence1_length': 976, 'unique_sentence1': 2650, 'min_sentence2_length': 6, 'average_sentence2_len': 32.44, 'max_sentence2_length': 1007, 'unique_sentence2': 2720, 'min_score': 0.0, 'avg_score': 2.47, 'max_score': 5.0}, 'ar-ar': {'num_samples': 250, 'number_of_characters': 16247, 'min_sentence1_length': 11, 'average_sentence1_len': 32.21, 'max_sentence1_length': 99, 'unique_sentence1': 250, 'min_sentence2_length': 9, 'average_sentence2_len': 32.78, 'max_sentence2_length': 83, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.22, 'max_score': 5.0}, 'en-ar': {'num_samples': 250, 'number_of_characters': 
18764, 'min_sentence1_length': 13, 'average_sentence1_len': 42.36, 'max_sentence1_length': 105, 'unique_sentence1': 250, 'min_sentence2_length': 10, 'average_sentence2_len': 32.7, 'max_sentence2_length': 104, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.14, 'max_score': 5.0}, 'en-de': {'num_samples': 250, 'number_of_characters': 22177, 'min_sentence1_length': 12, 'average_sentence1_len': 43.95, 'max_sentence1_length': 94, 'unique_sentence1': 250, 'min_sentence2_length': 15, 'average_sentence2_len': 44.76, 'max_sentence2_length': 104, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.28, 'max_score': 5.0}, 'en-en': {'num_samples': 250, 'number_of_characters': 21669, 'min_sentence1_length': 12, 'average_sentence1_len': 43.95, 'max_sentence1_length': 94, 'unique_sentence1': 250, 'min_sentence2_length': 15, 'average_sentence2_len': 42.72, 'max_sentence2_length': 101, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.28, 'max_score': 5.0}, 'en-tr': {'num_samples': 250, 'number_of_characters': 20879, 'min_sentence1_length': 15, 'average_sentence1_len': 41.92, 'max_sentence1_length': 101, 'unique_sentence1': 250, 'min_sentence2_length': 10, 'average_sentence2_len': 41.6, 'max_sentence2_length': 107, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.13, 'max_score': 5.0}, 'es-en': {'num_samples': 250, 'number_of_characters': 23216, 'min_sentence1_length': 12, 'average_sentence1_len': 50.84, 'max_sentence1_length': 160, 'unique_sentence1': 250, 'min_sentence2_length': 14, 'average_sentence2_len': 42.02, 'max_sentence2_length': 117, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.15, 'max_score': 5.0}, 'es-es': {'num_samples': 250, 'number_of_characters': 25265, 'min_sentence1_length': 18, 'average_sentence1_len': 49.84, 'max_sentence1_length': 136, 'unique_sentence1': 250, 'min_sentence2_length': 13, 'average_sentence2_len': 51.22, 'max_sentence2_length': 129, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.23, 
'max_score': 5.0}, 'fr-en': {'num_samples': 250, 'number_of_characters': 23087, 'min_sentence1_length': 19, 'average_sentence1_len': 49.62, 'max_sentence1_length': 115, 'unique_sentence1': 250, 'min_sentence2_length': 15, 'average_sentence2_len': 42.72, 'max_sentence2_length': 101, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.28, 'max_score': 5.0}, 'it-en': {'num_samples': 250, 'number_of_characters': 23188, 'min_sentence1_length': 15, 'average_sentence1_len': 50.03, 'max_sentence1_length': 113, 'unique_sentence1': 250, 'min_sentence2_length': 15, 'average_sentence2_len': 42.72, 'max_sentence2_length': 101, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.28, 'max_score': 5.0}, 'nl-en': {'num_samples': 250, 'number_of_characters': 22385, 'min_sentence1_length': 14, 'average_sentence1_len': 46.82, 'max_sentence1_length': 123, 'unique_sentence1': 250, 'min_sentence2_length': 15, 'average_sentence2_len': 42.72, 'max_sentence2_length': 101, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.28, 'max_score': 5.0}}}} | +| [STS17](https://alt.qcri.org/semeval2017/task1/) (Cer et al., 2017) | ['ara', 'deu', 'eng', 'fra', 'ita', 'kor', 'nld', 'spa', 'tur'] | STS | s2s | [News, Web, Written] | {'test': 5346} | {'test': {'num_samples': 5346, 'number_of_characters': 400264, 'min_sentence1_length': 6, 'average_sentence1_len': 38.15, 'max_sentence1_length': 976, 'unique_sentence1': 4900, 'min_sentence2_length': 6, 'average_sentence2_len': 36.73, 'max_sentence2_length': 1007, 'unique_sentence2': 4470, 'min_score': 0.0, 'avg_score': 2.36, 'max_score': 5.0, 'hf_subset_descriptive_stats': {'ko-ko': {'num_samples': 2846, 'number_of_characters': 183387, 'min_sentence1_length': 6, 'average_sentence1_len': 31.99, 'max_sentence1_length': 976, 'unique_sentence1': 2650, 'min_sentence2_length': 6, 'average_sentence2_len': 32.44, 'max_sentence2_length': 1007, 'unique_sentence2': 2720, 'min_score': 0.0, 'avg_score': 2.47, 'max_score': 5.0}, 'ar-ar': 
{'num_samples': 250, 'number_of_characters': 16247, 'min_sentence1_length': 11, 'average_sentence1_len': 32.21, 'max_sentence1_length': 99, 'unique_sentence1': 250, 'min_sentence2_length': 9, 'average_sentence2_len': 32.78, 'max_sentence2_length': 83, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.22, 'max_score': 5.0}, 'en-ar': {'num_samples': 250, 'number_of_characters': 18764, 'min_sentence1_length': 13, 'average_sentence1_len': 42.36, 'max_sentence1_length': 105, 'unique_sentence1': 250, 'min_sentence2_length': 10, 'average_sentence2_len': 32.7, 'max_sentence2_length': 104, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.14, 'max_score': 5.0}, 'en-de': {'num_samples': 250, 'number_of_characters': 22177, 'min_sentence1_length': 12, 'average_sentence1_len': 43.95, 'max_sentence1_length': 94, 'unique_sentence1': 250, 'min_sentence2_length': 15, 'average_sentence2_len': 44.76, 'max_sentence2_length': 104, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.28, 'max_score': 5.0}, 'en-en': {'num_samples': 250, 'number_of_characters': 21669, 'min_sentence1_length': 12, 'average_sentence1_len': 43.95, 'max_sentence1_length': 94, 'unique_sentence1': 250, 'min_sentence2_length': 15, 'average_sentence2_len': 42.72, 'max_sentence2_length': 101, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.28, 'max_score': 5.0}, 'en-tr': {'num_samples': 250, 'number_of_characters': 20879, 'min_sentence1_length': 15, 'average_sentence1_len': 41.92, 'max_sentence1_length': 101, 'unique_sentence1': 250, 'min_sentence2_length': 10, 'average_sentence2_len': 41.6, 'max_sentence2_length': 107, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.13, 'max_score': 5.0}, 'es-en': {'num_samples': 250, 'number_of_characters': 23216, 'min_sentence1_length': 12, 'average_sentence1_len': 50.84, 'max_sentence1_length': 160, 'unique_sentence1': 250, 'min_sentence2_length': 14, 'average_sentence2_len': 42.02, 'max_sentence2_length': 117, 'unique_sentence2': 250, 
'min_score': 0.0, 'avg_score': 2.15, 'max_score': 5.0}, 'es-es': {'num_samples': 250, 'number_of_characters': 25265, 'min_sentence1_length': 18, 'average_sentence1_len': 49.84, 'max_sentence1_length': 136, 'unique_sentence1': 250, 'min_sentence2_length': 13, 'average_sentence2_len': 51.22, 'max_sentence2_length': 129, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.23, 'max_score': 5.0}, 'fr-en': {'num_samples': 250, 'number_of_characters': 23087, 'min_sentence1_length': 19, 'average_sentence1_len': 49.62, 'max_sentence1_length': 115, 'unique_sentence1': 250, 'min_sentence2_length': 15, 'average_sentence2_len': 42.72, 'max_sentence2_length': 101, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.28, 'max_score': 5.0}, 'it-en': {'num_samples': 250, 'number_of_characters': 23188, 'min_sentence1_length': 15, 'average_sentence1_len': 50.03, 'max_sentence1_length': 113, 'unique_sentence1': 250, 'min_sentence2_length': 15, 'average_sentence2_len': 42.72, 'max_sentence2_length': 101, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.28, 'max_score': 5.0}, 'nl-en': {'num_samples': 250, 'number_of_characters': 22385, 'min_sentence1_length': 14, 'average_sentence1_len': 46.82, 'max_sentence1_length': 123, 'unique_sentence1': 250, 'min_sentence2_length': 15, 'average_sentence2_len': 42.72, 'max_sentence2_length': 101, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.28, 'max_score': 5.0}}}} | | [STS17MultilingualVisualSTS](https://arxiv.org/abs/2402.08183/) (Xiao et al., 2024) | ['ara', 'deu', 'eng', 'fra', 'ita', 'kor', 'nld', 'spa', 'tur'] | VisualSTS(multi) | i2i | [News, Social, Spoken, Web, Written] | {'test': 5346} | {'test': {'num_samples': 5346, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 
'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.36, 'max_score': 5.0, 'hf_subset_descriptive_stats': {'ko-ko': {'num_samples': 2846, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.47, 'max_score': 5.0}, 'ar-ar': {'num_samples': 250, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.22, 'max_score': 5.0}, 'en-ar': {'num_samples': 250, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.14, 'max_score': 5.0}, 'en-de': {'num_samples': 250, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.28, 'max_score': 5.0}, 'en-en': {'num_samples': 250, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 
448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.28, 'max_score': 5.0}, 'en-tr': {'num_samples': 250, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.13, 'max_score': 5.0}, 'es-en': {'num_samples': 250, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.15, 'max_score': 5.0}, 'es-es': {'num_samples': 250, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.23, 'max_score': 5.0}, 'fr-en': {'num_samples': 250, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.28, 'max_score': 5.0}, 'it-en': {'num_samples': 250, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 
448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.28, 'max_score': 5.0}, 'nl-en': {'num_samples': 250, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.28, 'max_score': 5.0}}}} | -| [STS22.v2](https://competitions.codalab.org/competitions/33835) | ['ara', 'cmn', 'deu', 'eng', 'fra', 'ita', 'pol', 'rus', 'spa', 'tur'] | STS | p2p | [News, Written] | None | None | +| [STS22.v2](https://competitions.codalab.org/competitions/33835) (Chen et al., 2022) | ['ara', 'cmn', 'deu', 'eng', 'fra', 'ita', 'pol', 'rus', 'spa', 'tur'] | STS | p2p | [News, Written] | None | None | | [STSB](https://aclanthology.org/2021.emnlp-main.357) (Shitao Xiao, 2024) | ['cmn'] | STS | s2s | | None | None | | [STSBenchmark](https://github.com/PhilipMay/stsb-multi-mt/) (Philip May, 2021) | ['eng'] | STS | s2s | [Blog, News, Written] | None | None | | [STSBenchmarkMultilingualSTS](https://github.com/PhilipMay/stsb-multi-mt/) (Philip May, 2021) | ['cmn', 'deu', 'eng', 'fra', 'ita', 'nld', 'pol', 'por', 'rus', 'spa'] | STS | s2s | [News, Social, Spoken, Web, Written] | None | None | | [STSBenchmarkMultilingualVisualSTS](https://arxiv.org/abs/2402.08183/) (Xiao et al., 2024) | ['cmn', 'deu', 'eng', 'fra', 'ita', 'nld', 'pol', 'por', 'rus', 'spa'] | VisualSTS(multi) | i2i | [News, Social, Spoken, Web, Written] | {'dev': 15000, 'test': 13790} | {'dev': {'num_samples': 15000, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 
'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.36, 'max_score': 5.0, 'hf_subset_descriptive_stats': {'en': {'num_samples': 1500, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.36, 'max_score': 5.0}, 'de': {'num_samples': 1500, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.36, 'max_score': 5.0}, 'es': {'num_samples': 1500, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.36, 'max_score': 5.0}, 'fr': {'num_samples': 1500, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.36, 'max_score': 5.0}, 'it': 
{'num_samples': 1500, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.36, 'max_score': 5.0}, 'nl': {'num_samples': 1500, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.36, 'max_score': 5.0}, 'pl': {'num_samples': 1500, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.36, 'max_score': 5.0}, 'pt': {'num_samples': 1500, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.36, 'max_score': 5.0}, 'ru': {'num_samples': 1500, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 
'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.36, 'max_score': 5.0}, 'zh': {'num_samples': 1500, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.36, 'max_score': 5.0}}}, 'test': {'num_samples': 13790, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.61, 'max_score': 5.0, 'hf_subset_descriptive_stats': {'en': {'num_samples': 1379, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.61, 'max_score': 5.0}, 'de': {'num_samples': 1379, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.61, 'max_score': 5.0}, 'es': {'num_samples': 1379, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 
'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.61, 'max_score': 5.0}, 'fr': {'num_samples': 1379, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.61, 'max_score': 5.0}, 'it': {'num_samples': 1379, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.61, 'max_score': 5.0}, 'nl': {'num_samples': 1379, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.61, 'max_score': 5.0}, 'pl': {'num_samples': 1379, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.61, 'max_score': 5.0}, 'pt': {'num_samples': 1379, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 
'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.61, 'max_score': 5.0}, 'ru': {'num_samples': 1379, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.61, 'max_score': 5.0}, 'zh': {'num_samples': 1379, 'min_image1_width': 448, 'average_image1_width': 448.0, 'max_image1_width': 448, 'min_image1_height': 448, 'average_image1_height': 448.0, 'max_image1_height': 448, 'min_image2_width': 448, 'average_image2_width': 448.0, 'max_image2_width': 448, 'min_image2_height': 448, 'average_image2_height': 448.0, 'max_image2_height': 448, 'min_score': 0.0, 'avg_score': 2.61, 'max_score': 5.0}}}} | -| [STSES](https://huggingface.co/datasets/PlanTL-GOB-ES/sts-es) (Agirre et al., 2015) | ['spa'] | STS | s2s | [Written] | None | None | +| [STSES](https://huggingface.co/datasets/PlanTL-GOB-ES/sts-es) (Agirre et al., 2014) | ['spa'] | STS | s2s | [Written] | None | None | | [SUN397](https://ieeexplore.ieee.org/abstract/document/5539970) (Xiao et al., 2010) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | {'test': 21750} | {'test': {'num_samples': 21750, 'unique_num_labels': 397, 'min_image_width': 125, 'average_image_width': 354.22, 'max_image_width': 696, 'min_image_height': 94, 'average_image_height': 291.17, 'max_image_height': 595, 'labels': {'227': {'count': 439}, '213': {'count': 335}, '53': {'count': 23}, '350': {'count': 40}, '73': {'count': 38}, '316': {'count': 63}, '177': {'count': 80}, '25': {'count': 39}, '275': {'count': 31}, '328': {'count': 33}, 
'263': {'count': 47}, '239': {'count': 26}, '41': {'count': 213}, '319': {'count': 51}, '91': {'count': 16}, '95': {'count': 183}, '396': {'count': 20}, '259': {'count': 36}, '107': {'count': 167}, '381': {'count': 164}, '174': {'count': 167}, '246': {'count': 44}, '67': {'count': 31}, '374': {'count': 28}, '354': {'count': 22}, '72': {'count': 100}, '97': {'count': 32}, '256': {'count': 57}, '247': {'count': 57}, '159': {'count': 49}, '270': {'count': 135}, '133': {'count': 215}, '197': {'count': 40}, '12': {'count': 38}, '2': {'count': 226}, '115': {'count': 75}, '200': {'count': 93}, '47': {'count': 103}, '9': {'count': 37}, '22': {'count': 76}, '255': {'count': 34}, '267': {'count': 22}, '244': {'count': 93}, '85': {'count': 115}, '342': {'count': 87}, '55': {'count': 50}, '7': {'count': 41}, '337': {'count': 99}, '38': {'count': 28}, '269': {'count': 69}, '106': {'count': 15}, '298': {'count': 27}, '361': {'count': 53}, '8': {'count': 108}, '166': {'count': 47}, '280': {'count': 51}, '35': {'count': 61}, '147': {'count': 82}, '214': {'count': 26}, '284': {'count': 28}, '286': {'count': 66}, '113': {'count': 67}, '83': {'count': 38}, '82': {'count': 236}, '365': {'count': 17}, '242': {'count': 116}, '186': {'count': 38}, '87': {'count': 111}, '274': {'count': 48}, '27': {'count': 95}, '283': {'count': 22}, '4': {'count': 76}, '334': {'count': 139}, '364': {'count': 21}, '48': {'count': 408}, '311': {'count': 41}, '101': {'count': 64}, '131': {'count': 55}, '172': {'count': 31}, '355': {'count': 28}, '308': {'count': 56}, '5': {'count': 47}, '318': {'count': 155}, '86': {'count': 87}, '46': {'count': 230}, '111': {'count': 69}, '88': {'count': 54}, '23': {'count': 47}, '70': {'count': 61}, '217': {'count': 34}, '11': {'count': 76}, '193': {'count': 207}, '0': {'count': 99}, '303': {'count': 23}, '324': {'count': 47}, '377': {'count': 19}, '345': {'count': 39}, '154': {'count': 49}, '393': {'count': 68}, '152': {'count': 58}, '317': {'count': 27}, '384': 
{'count': 46}, '257': {'count': 38}, '294': {'count': 47}, '145': {'count': 23}, '289': {'count': 33}, '375': {'count': 19}, '57': {'count': 42}, '15': {'count': 62}, '109': {'count': 24}, '139': {'count': 24}, '66': {'count': 26}, '340': {'count': 32}, '150': {'count': 41}, '118': {'count': 105}, '333': {'count': 27}, '126': {'count': 55}, '366': {'count': 116}, '358': {'count': 151}, '251': {'count': 37}, '309': {'count': 35}, '54': {'count': 20}, '327': {'count': 38}, '3': {'count': 60}, '21': {'count': 56}, '17': {'count': 62}, '146': {'count': 84}, '94': {'count': 42}, '243': {'count': 48}, '335': {'count': 85}, '245': {'count': 141}, '279': {'count': 187}, '360': {'count': 25}, '192': {'count': 105}, '49': {'count': 31}, '230': {'count': 81}, '357': {'count': 22}, '64': {'count': 72}, '112': {'count': 26}, '338': {'count': 70}, '216': {'count': 99}, '234': {'count': 183}, '300': {'count': 153}, '188': {'count': 48}, '254': {'count': 41}, '184': {'count': 183}, '373': {'count': 47}, '221': {'count': 86}, '84': {'count': 49}, '81': {'count': 119}, '161': {'count': 97}, '352': {'count': 21}, '105': {'count': 43}, '39': {'count': 59}, '383': {'count': 40}, '341': {'count': 56}, '63': {'count': 158}, '125': {'count': 29}, '302': {'count': 83}, '262': {'count': 40}, '392': {'count': 51}, '326': {'count': 173}, '228': {'count': 93}, '339': {'count': 25}, '80': {'count': 73}, '30': {'count': 42}, '264': {'count': 112}, '56': {'count': 94}, '321': {'count': 16}, '395': {'count': 52}, '68': {'count': 45}, '211': {'count': 45}, '44': {'count': 26}, '299': {'count': 21}, '220': {'count': 35}, '61': {'count': 20}, '138': {'count': 55}, '108': {'count': 111}, '10': {'count': 35}, '386': {'count': 28}, '297': {'count': 49}, '210': {'count': 36}, '175': {'count': 77}, '260': {'count': 68}, '391': {'count': 69}, '102': {'count': 77}, '26': {'count': 44}, '232': {'count': 54}, '6': {'count': 158}, '124': {'count': 43}, '14': {'count': 23}, '201': {'count': 39}, '168': 
{'count': 18}, '202': {'count': 26}, '140': {'count': 31}, '261': {'count': 60}, '104': {'count': 27}, '356': {'count': 22}, '34': {'count': 147}, '225': {'count': 111}, '60': {'count': 84}, '156': {'count': 35}, '237': {'count': 45}, '268': {'count': 87}, '310': {'count': 31}, '249': {'count': 73}, '281': {'count': 46}, '75': {'count': 89}, '77': {'count': 53}, '132': {'count': 45}, '235': {'count': 42}, '336': {'count': 84}, '123': {'count': 27}, '349': {'count': 90}, '180': {'count': 49}, '378': {'count': 17}, '332': {'count': 30}, '185': {'count': 29}, '389': {'count': 60}, '382': {'count': 77}, '198': {'count': 54}, '74': {'count': 48}, '231': {'count': 85}, '76': {'count': 54}, '151': {'count': 64}, '182': {'count': 17}, '209': {'count': 39}, '344': {'count': 37}, '204': {'count': 67}, '329': {'count': 23}, '380': {'count': 91}, '388': {'count': 32}, '116': {'count': 29}, '24': {'count': 103}, '199': {'count': 33}, '369': {'count': 14}, '359': {'count': 77}, '325': {'count': 39}, '323': {'count': 34}, '162': {'count': 35}, '33': {'count': 46}, '129': {'count': 21}, '287': {'count': 30}, '155': {'count': 24}, '170': {'count': 157}, '296': {'count': 40}, '110': {'count': 102}, '304': {'count': 21}, '164': {'count': 37}, '278': {'count': 23}, '71': {'count': 18}, '194': {'count': 24}, '136': {'count': 117}, '103': {'count': 134}, '330': {'count': 26}, '347': {'count': 26}, '206': {'count': 50}, '178': {'count': 43}, '362': {'count': 26}, '119': {'count': 111}, '208': {'count': 33}, '165': {'count': 44}, '90': {'count': 36}, '167': {'count': 40}, '187': {'count': 26}, '99': {'count': 50}, '390': {'count': 64}, '205': {'count': 16}, '65': {'count': 30}, '293': {'count': 23}, '223': {'count': 19}, '96': {'count': 31}, '305': {'count': 44}, '100': {'count': 57}, '385': {'count': 18}, '78': {'count': 42}, '59': {'count': 20}, '37': {'count': 59}, '219': {'count': 76}, '212': {'count': 28}, '1': {'count': 26}, '122': {'count': 35}, '92': {'count': 62}, '43': {'count': 
39}, '196': {'count': 56}, '19': {'count': 25}, '128': {'count': 35}, '376': {'count': 77}, '313': {'count': 30}, '114': {'count': 54}, '121': {'count': 31}, '169': {'count': 62}, '331': {'count': 55}, '238': {'count': 16}, '179': {'count': 31}, '127': {'count': 31}, '370': {'count': 98}, '149': {'count': 47}, '346': {'count': 41}, '250': {'count': 22}, '276': {'count': 25}, '163': {'count': 43}, '18': {'count': 33}, '282': {'count': 23}, '215': {'count': 33}, '258': {'count': 60}, '240': {'count': 29}, '233': {'count': 14}, '93': {'count': 27}, '69': {'count': 23}, '266': {'count': 26}, '387': {'count': 55}, '141': {'count': 18}, '191': {'count': 26}, '183': {'count': 42}, '271': {'count': 22}, '120': {'count': 32}, '98': {'count': 53}, '29': {'count': 34}, '28': {'count': 21}, '144': {'count': 26}, '351': {'count': 50}, '368': {'count': 20}, '314': {'count': 27}, '45': {'count': 17}, '218': {'count': 50}, '348': {'count': 25}, '157': {'count': 35}, '117': {'count': 24}, '367': {'count': 24}, '13': {'count': 31}, '363': {'count': 22}, '79': {'count': 28}, '312': {'count': 27}, '372': {'count': 29}, '189': {'count': 21}, '50': {'count': 22}, '160': {'count': 35}, '16': {'count': 39}, '222': {'count': 21}, '58': {'count': 37}, '153': {'count': 64}, '62': {'count': 21}, '290': {'count': 25}, '292': {'count': 24}, '285': {'count': 25}, '343': {'count': 32}, '301': {'count': 19}, '190': {'count': 46}, '195': {'count': 24}, '135': {'count': 30}, '315': {'count': 25}, '203': {'count': 29}, '307': {'count': 18}, '142': {'count': 25}, '173': {'count': 28}, '236': {'count': 41}, '171': {'count': 23}, '371': {'count': 17}, '130': {'count': 15}, '277': {'count': 39}, '248': {'count': 22}, '181': {'count': 35}, '40': {'count': 20}, '322': {'count': 15}, '273': {'count': 23}, '148': {'count': 23}, '295': {'count': 25}, '32': {'count': 21}, '320': {'count': 25}, '137': {'count': 32}, '253': {'count': 36}, '31': {'count': 19}, '306': {'count': 27}, '51': {'count': 19}, '52': 
{'count': 29}, '176': {'count': 31}, '241': {'count': 23}, '265': {'count': 32}, '394': {'count': 26}, '158': {'count': 26}, '226': {'count': 28}, '288': {'count': 21}, '353': {'count': 19}, '291': {'count': 21}, '224': {'count': 26}, '36': {'count': 38}, '20': {'count': 22}, '252': {'count': 18}, '134': {'count': 24}, '143': {'count': 21}, '207': {'count': 28}, '89': {'count': 16}, '272': {'count': 23}, '379': {'count': 24}, '229': {'count': 20}, '42': {'count': 23}}}} | | [SUN397ZeroShot](https://ieeexplore.ieee.org/abstract/document/5539970) (Xiao et al., 2010) | ['eng'] | ZeroShotClassification | i2t | [Encyclopaedic] | {'test': 21750} | {'test': {'num_samples': 21750, 'unique_num_labels': 397, 'min_image_width': 125, 'average_image_width': 354.22, 'max_image_width': 696, 'min_image_height': 94, 'average_image_height': 291.17, 'max_image_height': 595, 'min_label_text_length': 17, 'average_label_text_length': 25.9, 'max_label_text_length': 41, 'labels': {'227': {'count': 439}, '213': {'count': 335}, '53': {'count': 23}, '350': {'count': 40}, '73': {'count': 38}, '316': {'count': 63}, '177': {'count': 80}, '25': {'count': 39}, '275': {'count': 31}, '328': {'count': 33}, '263': {'count': 47}, '239': {'count': 26}, '41': {'count': 213}, '319': {'count': 51}, '91': {'count': 16}, '95': {'count': 183}, '396': {'count': 20}, '259': {'count': 36}, '107': {'count': 167}, '381': {'count': 164}, '174': {'count': 167}, '246': {'count': 44}, '67': {'count': 31}, '374': {'count': 28}, '354': {'count': 22}, '72': {'count': 100}, '97': {'count': 32}, '256': {'count': 57}, '247': {'count': 57}, '159': {'count': 49}, '270': {'count': 135}, '133': {'count': 215}, '197': {'count': 40}, '12': {'count': 38}, '2': {'count': 226}, '115': {'count': 75}, '200': {'count': 93}, '47': {'count': 103}, '9': {'count': 37}, '22': {'count': 76}, '255': {'count': 34}, '267': {'count': 22}, '244': {'count': 93}, '85': {'count': 115}, '342': {'count': 87}, '55': {'count': 50}, '7': {'count': 41}, 
'337': {'count': 99}, '38': {'count': 28}, '269': {'count': 69}, '106': {'count': 15}, '298': {'count': 27}, '361': {'count': 53}, '8': {'count': 108}, '166': {'count': 47}, '280': {'count': 51}, '35': {'count': 61}, '147': {'count': 82}, '214': {'count': 26}, '284': {'count': 28}, '286': {'count': 66}, '113': {'count': 67}, '83': {'count': 38}, '82': {'count': 236}, '365': {'count': 17}, '242': {'count': 116}, '186': {'count': 38}, '87': {'count': 111}, '274': {'count': 48}, '27': {'count': 95}, '283': {'count': 22}, '4': {'count': 76}, '334': {'count': 139}, '364': {'count': 21}, '48': {'count': 408}, '311': {'count': 41}, '101': {'count': 64}, '131': {'count': 55}, '172': {'count': 31}, '355': {'count': 28}, '308': {'count': 56}, '5': {'count': 47}, '318': {'count': 155}, '86': {'count': 87}, '46': {'count': 230}, '111': {'count': 69}, '88': {'count': 54}, '23': {'count': 47}, '70': {'count': 61}, '217': {'count': 34}, '11': {'count': 76}, '193': {'count': 207}, '0': {'count': 99}, '303': {'count': 23}, '324': {'count': 47}, '377': {'count': 19}, '345': {'count': 39}, '154': {'count': 49}, '393': {'count': 68}, '152': {'count': 58}, '317': {'count': 27}, '384': {'count': 46}, '257': {'count': 38}, '294': {'count': 47}, '145': {'count': 23}, '289': {'count': 33}, '375': {'count': 19}, '57': {'count': 42}, '15': {'count': 62}, '109': {'count': 24}, '139': {'count': 24}, '66': {'count': 26}, '340': {'count': 32}, '150': {'count': 41}, '118': {'count': 105}, '333': {'count': 27}, '126': {'count': 55}, '366': {'count': 116}, '358': {'count': 151}, '251': {'count': 37}, '309': {'count': 35}, '54': {'count': 20}, '327': {'count': 38}, '3': {'count': 60}, '21': {'count': 56}, '17': {'count': 62}, '146': {'count': 84}, '94': {'count': 42}, '243': {'count': 48}, '335': {'count': 85}, '245': {'count': 141}, '279': {'count': 187}, '360': {'count': 25}, '192': {'count': 105}, '49': {'count': 31}, '230': {'count': 81}, '357': {'count': 22}, '64': {'count': 72}, '112': 
{'count': 26}, '338': {'count': 70}, '216': {'count': 99}, '234': {'count': 183}, '300': {'count': 153}, '188': {'count': 48}, '254': {'count': 41}, '184': {'count': 183}, '373': {'count': 47}, '221': {'count': 86}, '84': {'count': 49}, '81': {'count': 119}, '161': {'count': 97}, '352': {'count': 21}, '105': {'count': 43}, '39': {'count': 59}, '383': {'count': 40}, '341': {'count': 56}, '63': {'count': 158}, '125': {'count': 29}, '302': {'count': 83}, '262': {'count': 40}, '392': {'count': 51}, '326': {'count': 173}, '228': {'count': 93}, '339': {'count': 25}, '80': {'count': 73}, '30': {'count': 42}, '264': {'count': 112}, '56': {'count': 94}, '321': {'count': 16}, '395': {'count': 52}, '68': {'count': 45}, '211': {'count': 45}, '44': {'count': 26}, '299': {'count': 21}, '220': {'count': 35}, '61': {'count': 20}, '138': {'count': 55}, '108': {'count': 111}, '10': {'count': 35}, '386': {'count': 28}, '297': {'count': 49}, '210': {'count': 36}, '175': {'count': 77}, '260': {'count': 68}, '391': {'count': 69}, '102': {'count': 77}, '26': {'count': 44}, '232': {'count': 54}, '6': {'count': 158}, '124': {'count': 43}, '14': {'count': 23}, '201': {'count': 39}, '168': {'count': 18}, '202': {'count': 26}, '140': {'count': 31}, '261': {'count': 60}, '104': {'count': 27}, '356': {'count': 22}, '34': {'count': 147}, '225': {'count': 111}, '60': {'count': 84}, '156': {'count': 35}, '237': {'count': 45}, '268': {'count': 87}, '310': {'count': 31}, '249': {'count': 73}, '281': {'count': 46}, '75': {'count': 89}, '77': {'count': 53}, '132': {'count': 45}, '235': {'count': 42}, '336': {'count': 84}, '123': {'count': 27}, '349': {'count': 90}, '180': {'count': 49}, '378': {'count': 17}, '332': {'count': 30}, '185': {'count': 29}, '389': {'count': 60}, '382': {'count': 77}, '198': {'count': 54}, '74': {'count': 48}, '231': {'count': 85}, '76': {'count': 54}, '151': {'count': 64}, '182': {'count': 17}, '209': {'count': 39}, '344': {'count': 37}, '204': {'count': 67}, '329': 
{'count': 23}, '380': {'count': 91}, '388': {'count': 32}, '116': {'count': 29}, '24': {'count': 103}, '199': {'count': 33}, '369': {'count': 14}, '359': {'count': 77}, '325': {'count': 39}, '323': {'count': 34}, '162': {'count': 35}, '33': {'count': 46}, '129': {'count': 21}, '287': {'count': 30}, '155': {'count': 24}, '170': {'count': 157}, '296': {'count': 40}, '110': {'count': 102}, '304': {'count': 21}, '164': {'count': 37}, '278': {'count': 23}, '71': {'count': 18}, '194': {'count': 24}, '136': {'count': 117}, '103': {'count': 134}, '330': {'count': 26}, '347': {'count': 26}, '206': {'count': 50}, '178': {'count': 43}, '362': {'count': 26}, '119': {'count': 111}, '208': {'count': 33}, '165': {'count': 44}, '90': {'count': 36}, '167': {'count': 40}, '187': {'count': 26}, '99': {'count': 50}, '390': {'count': 64}, '205': {'count': 16}, '65': {'count': 30}, '293': {'count': 23}, '223': {'count': 19}, '96': {'count': 31}, '305': {'count': 44}, '100': {'count': 57}, '385': {'count': 18}, '78': {'count': 42}, '59': {'count': 20}, '37': {'count': 59}, '219': {'count': 76}, '212': {'count': 28}, '1': {'count': 26}, '122': {'count': 35}, '92': {'count': 62}, '43': {'count': 39}, '196': {'count': 56}, '19': {'count': 25}, '128': {'count': 35}, '376': {'count': 77}, '313': {'count': 30}, '114': {'count': 54}, '121': {'count': 31}, '169': {'count': 62}, '331': {'count': 55}, '238': {'count': 16}, '179': {'count': 31}, '127': {'count': 31}, '370': {'count': 98}, '149': {'count': 47}, '346': {'count': 41}, '250': {'count': 22}, '276': {'count': 25}, '163': {'count': 43}, '18': {'count': 33}, '282': {'count': 23}, '215': {'count': 33}, '258': {'count': 60}, '240': {'count': 29}, '233': {'count': 14}, '93': {'count': 27}, '69': {'count': 23}, '266': {'count': 26}, '387': {'count': 55}, '141': {'count': 18}, '191': {'count': 26}, '183': {'count': 42}, '271': {'count': 22}, '120': {'count': 32}, '98': {'count': 53}, '29': {'count': 34}, '28': {'count': 21}, '144': {'count': 
26}, '351': {'count': 50}, '368': {'count': 20}, '314': {'count': 27}, '45': {'count': 17}, '218': {'count': 50}, '348': {'count': 25}, '157': {'count': 35}, '117': {'count': 24}, '367': {'count': 24}, '13': {'count': 31}, '363': {'count': 22}, '79': {'count': 28}, '312': {'count': 27}, '372': {'count': 29}, '189': {'count': 21}, '50': {'count': 22}, '160': {'count': 35}, '16': {'count': 39}, '222': {'count': 21}, '58': {'count': 37}, '153': {'count': 64}, '62': {'count': 21}, '290': {'count': 25}, '292': {'count': 24}, '285': {'count': 25}, '343': {'count': 32}, '301': {'count': 19}, '190': {'count': 46}, '195': {'count': 24}, '135': {'count': 30}, '315': {'count': 25}, '203': {'count': 29}, '307': {'count': 18}, '142': {'count': 25}, '173': {'count': 28}, '236': {'count': 41}, '171': {'count': 23}, '371': {'count': 17}, '130': {'count': 15}, '277': {'count': 39}, '248': {'count': 22}, '181': {'count': 35}, '40': {'count': 20}, '322': {'count': 15}, '273': {'count': 23}, '148': {'count': 23}, '295': {'count': 25}, '32': {'count': 21}, '320': {'count': 25}, '137': {'count': 32}, '253': {'count': 36}, '31': {'count': 19}, '306': {'count': 27}, '51': {'count': 19}, '52': {'count': 29}, '176': {'count': 31}, '241': {'count': 23}, '265': {'count': 32}, '394': {'count': 26}, '158': {'count': 26}, '226': {'count': 28}, '288': {'count': 21}, '353': {'count': 19}, '291': {'count': 21}, '224': {'count': 26}, '36': {'count': 38}, '20': {'count': 22}, '252': {'count': 18}, '134': {'count': 24}, '143': {'count': 21}, '207': {'count': 28}, '89': {'count': 16}, '272': {'count': 23}, '379': {'count': 24}, '229': {'count': 20}, '42': {'count': 23}}}} | | [SadeemQuestionRetrieval](https://huggingface.co/datasets/sadeem-ai/sadeem-ar-eval-retrieval-questions) | ['ara'] | Retrieval | s2p | [Written, Written] | None | None | -| [SanskritShlokasClassification](https://github.com/goru001/nlp-for-sanskrit) | ['san'] | Classification | s2s | [Religious, Written] | None | None | -| 
[ScalaClassification](https://aclanthology.org/2023.nodalida-1.20/) | ['dan', 'nno', 'nob', 'swe'] | Classification | s2s | [Blog, Fiction, News, Non-fiction, Spoken, Web, Written] | None | None | -| [SciDocsRR](https://allenai.org/data/scidocs) | ['eng'] | Reranking | s2s | [Academic, Non-fiction, Written] | None | None | +| [SanskritShlokasClassification](https://github.com/goru001/nlp-for-sanskrit) (Arora et al., 2020) | ['san'] | Classification | s2s | [Religious, Written] | None | None | +| [ScalaClassification](https://aclanthology.org/2023.nodalida-1.20/) (Nielsen et al., 2023) | ['dan', 'nno', 'nob', 'swe'] | Classification | s2s | [Blog, Fiction, News, Non-fiction, Spoken, Web, Written] | None | None | +| [ScandiSentClassification](https://github.com/timpal0l/ScandiSent) (Isbister et al., 2021) | ['dan', 'eng', 'fin', 'nob', 'swe'] | Classification | s2s | [Reviews, Written] | None | None | +| [SciDocsRR](https://allenai.org/data/scidocs) (Cohan et al., 2020) | ['eng'] | Reranking | s2s | [Academic, Non-fiction, Written] | None | None | | [SciFact](https://github.com/allenai/scifact) (Arman Cohan, 2020) | ['eng'] | Retrieval | s2p | [Academic, Medical, Written] | None | None | | [SciFact-Fa](https://huggingface.co/datasets/MCINext/scifact-fa) | ['fas'] | Retrieval | s2p | [Academic] | None | None | | [SciFact-NL](https://huggingface.co/datasets/clips/beir-nl-scifact) (Nikolay Banar, 2024) | ['nld'] | Retrieval | s2p | [Academic, Medical, Written] | None | None | @@ -695,7 +720,8 @@ The following tables give you an overview of the tasks in MTEB. 
| [SciMMIRI2TRetrieval](https://aclanthology.org/2024.findings-acl.746/) (Wu et al., 2024) | ['eng'] | Any2AnyRetrieval | i2t | [Academic] | {'test': 32526} | {'test': {'number_of_characters': 4247786, 'num_samples': 32526, 'num_queries': 16263, 'num_documents': 16263, 'min_document_length': 21, 'average_document_length': 261.19, 'max_document_length': 2184, 'unique_documents': 16263, 'num_document_images': 0, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 16263, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 16263}} | | [SciMMIRT2IRetrieval](https://aclanthology.org/2024.findings-acl.746/) (Wu et al., 2024) | ['eng'] | Any2AnyRetrieval | t2i | [Academic] | {'test': 32526} | {'test': {'number_of_characters': 4247786, 'num_samples': 32526, 'num_queries': 16263, 'num_documents': 16263, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 16263, 'min_query_length': 21, 'average_query_length': 261.19, 'max_query_length': 2184, 'unique_queries': 16263, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 16263}} | | [SemRel24STS](https://huggingface.co/datasets/SemRel/SemRel2024) (Nedjma Ousidhoum, 2024) | ['afr', 'amh', 'arb', 'arq', 'ary', 'eng', 'hau', 'hin', 'ind', 'kin', 'mar', 'tel'] | STS | s2s | [Spoken, Written] | None | None | -| [SensitiveTopicsClassification](https://aclanthology.org/2021.bsnlp-1.4) | ['rus'] | MultilabelClassification | s2s | [Social, Web, Written] | None | None | +| [SensitiveTopicsClassification](https://aclanthology.org/2021.bsnlp-1.4) (Babakov et al., 2021) | ['rus'] | MultilabelClassification | s2s | [Social, Web, Written] | None | None | +| [SentiRuEval2016](https://github.com/mokoron/sentirueval) (Loukachevitch 
et al., 2016) | ['rus'] | Classification | t2t | | None | None | | [SentimentAnalysisHindi](https://huggingface.co/datasets/OdiaGenAI/sentiment_analysis_hindi) (Shantipriya Parida, 2023) | ['hin'] | Classification | s2s | [Reviews, Written] | None | None | | [SentimentDKSF](https://github.com/hezarai/hezar) | ['fas'] | Classification | s2p | [Reviews] | None | None | | [SinhalaNewsClassification](https://huggingface.co/datasets/NLPC-UOM/Sinhala-News-Category-classification) (Nisansa de Silva, 2015) | ['sin'] | Classification | s2s | [News, Written] | None | None | @@ -703,30 +729,31 @@ The following tables give you an overview of the tasks in MTEB. | [SiswatiNewsClassification](https://huggingface.co/datasets/dsfsi/za-isizulu-siswati-news) (Madodonga et al., 2023) | ['ssw'] | Classification | s2s | [News, Written] | None | None | | [SketchyI2IRetrieval](https://arxiv.org/abs/2202.01747) (Ypsilantis et al., 2021) | ['eng'] | Any2AnyRetrieval | i2i | [Encyclopaedic] | {'test': 477886} | {'test': {'number_of_characters': 0, 'num_samples': 477886, 'num_queries': 452886, 'num_documents': 25000, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 25000, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 452886, 'min_relevant_docs_per_query': 100, 'average_relevant_docs_per_query': 100.0, 'max_relevant_docs_per_query': 100, 'unique_relevant_docs': 12500}} | | [SlovakHateSpeechClassification](https://huggingface.co/datasets/TUKE-KEMT/hate_speech_slovak) | ['slk'] | Classification | s2s | [Social, Written] | {'test': 1319, 'train': 11870} | {'test': {'num_samples': 1319, 'number_of_characters': 122279, 'num_texts_in_train': 46, 'min_text_length': 8, 'average_text_length': 92.71, 'max_text_length': 1584, 'unique_text': 1315, 'unique_labels': 2, 'labels': {'1': {'count': 360}, '0': {'count': 959}}}, 'train': {'num_samples': 11870, 
'number_of_characters': 1130860, 'num_texts_in_train': None, 'min_text_length': 7, 'average_text_length': 95.27, 'max_text_length': 2112, 'unique_text': 11655, 'unique_labels': 2, 'labels': {'1': {'count': 3245}, '0': {'count': 8625}}}} | -| [SlovakMovieReviewSentimentClassification](https://arxiv.org/pdf/2304.01922) ({ {S, 2023) | ['svk'] | Classification | s2s | [Reviews, Written] | None | None | +| [SlovakMovieReviewSentimentClassification](https://arxiv.org/pdf/2304.01922) ({\v{S, 2023) | ['svk'] | Classification | s2s | [Reviews, Written] | None | None | | [SlovakSumRetrieval](https://huggingface.co/datasets/NaiveNeuron/slovaksum) | ['slk'] | Retrieval | s2s | [News, Social, Web, Written] | None | None | | [SouthAfricanLangClassification](https://www.kaggle.com/competitions/south-african-language-identification/) (ExploreAI Academy et al., 2022) | ['afr', 'eng', 'nbl', 'nso', 'sot', 'ssw', 'tsn', 'tso', 'ven', 'xho', 'zul'] | Classification | s2s | [Non-fiction, Web, Written] | None | None | | [SpanishNewsClassification](https://huggingface.co/datasets/MarcOrfilaCarreras/spanish-news) | ['spa'] | Classification | s2s | [News, Written] | None | None | | [SpanishNewsClusteringP2P](https://www.kaggle.com/datasets/kevinmorgado/spanish-news-classification) | ['spa'] | Clustering | p2p | | None | None | -| [SpanishPassageRetrievalS2P](https://mklab.iti.gr/results/spanish-passage-retrieval-dataset/) | ['spa'] | Retrieval | s2p | | None | None | -| [SpanishPassageRetrievalS2S](https://mklab.iti.gr/results/spanish-passage-retrieval-dataset/) | ['spa'] | Retrieval | s2s | | None | None | -| [SpanishSentimentClassification](https://huggingface.co/datasets/sepidmnorozy/Spanish_sentiment) | ['spa'] | Classification | s2s | [Reviews, Written] | None | None | -| [SpartQA](https://github.com/HLR/SpartQA_generation) (Xiao et al., 2024) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | -| 
[SprintDuplicateQuestions](https://www.aclweb.org/anthology/D18-1131/) | ['eng'] | PairClassification | s2s | [Programming, Written] | None | None | -| [StackExchangeClustering.v2](https://arxiv.org/abs/2104.07081) (Gregor Geigle and Nils Reimers and Andreas R{"u, 2021) | ['eng'] | Clustering | s2s | [Web, Written] | None | None | -| [StackExchangeClusteringP2P.v2](https://arxiv.org/abs/2104.07081) (Gregor Geigle and Nils Reimers and Andreas R{"u, 2021) | ['eng'] | Clustering | p2p | [Web, Written] | None | None | +| [SpanishPassageRetrievalS2P](https://mklab.iti.gr/results/spanish-passage-retrieval-dataset/) (Kamateri et al., 2019) | ['spa'] | Retrieval | s2p | | None | None | +| [SpanishPassageRetrievalS2S](https://mklab.iti.gr/results/spanish-passage-retrieval-dataset/) (Kamateri et al., 2019) | ['spa'] | Retrieval | s2s | | None | None | +| [SpanishSentimentClassification](https://huggingface.co/datasets/sepidmnorozy/Spanish_sentiment) (Mollanorozy et al., 2023) | ['spa'] | Classification | s2s | [Reviews, Written] | None | None | +| [SpartQA](https://github.com/HLR/SpartQA_generation) (Mirzaee et al., 2021) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | +| [SpokenQAforIC](https://huggingface.co/datasets/DynamicSuperb/SpokenQA_SLUE) (Suwon Shon, 2023) | ['eng'] | AudioClassification | a2t | [Spoken] | None | None | +| [SprintDuplicateQuestions](https://www.aclweb.org/anthology/D18-1131/) (Shah et al., 2018) | ['eng'] | PairClassification | s2s | [Programming, Written] | None | None | +| [StackExchangeClustering.v2](https://arxiv.org/abs/2104.07081) (Gregor Geigle and Nils Reimers and Andreas R{\"u, 2021) | ['eng'] | Clustering | s2s | [Web, Written] | None | None | +| [StackExchangeClusteringP2P.v2](https://arxiv.org/abs/2104.07081) (Gregor Geigle and Nils Reimers and Andreas R{\"u, 2021) | ['eng'] | Clustering | p2p | [Web, Written] | None | None | | 
[StackOverflowDupQuestions](https://www.microsoft.com/en-us/research/uploads/prod/2019/03/nl4se18LinkSO.pdf) (Xueqing Liu, 2018) | ['eng'] | Reranking | s2s | [Blog, Programming, Written] | None | None | | [StackOverflowQA](https://arxiv.org/abs/2407.02883) (Xiangyang Li, 2024) | ['eng'] | Retrieval | p2p | [Programming, Written] | {'test': 21925} | {'test': {'number_of_characters': 26584028, 'num_samples': 21925, 'num_queries': 1994, 'num_documents': 19931, 'min_document_length': 61, 'average_document_length': 130.32, 'max_document_length': 22234, 'unique_documents': 19931, 'min_query_length': 5, 'average_query_length': 12029.38, 'max_query_length': 46028, 'unique_queries': 1994, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1994}} | | [StanfordCars](https://pure.mpg.de/rest/items/item_2029263/component/file_2029262/content) (Jonathan Krause, 2013) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | {'test': 8041} | {'test': {'num_samples': 8041, 'unique_num_labels': 196, 'min_image_width': 78, 'average_image_width': 701.18, 'max_image_width': 7800, 'min_image_height': 41, 'average_image_height': 483.75, 'max_image_height': 5400, 'labels': {'180': {'count': 38}, '102': {'count': 39}, '144': {'count': 44}, '186': {'count': 43}, '184': {'count': 38}, '77': {'count': 37}, '117': {'count': 41}, '164': {'count': 44}, '31': {'count': 41}, '59': {'count': 36}, '48': {'count': 37}, '107': {'count': 44}, '115': {'count': 37}, '134': {'count': 42}, '82': {'count': 40}, '50': {'count': 43}, '153': {'count': 42}, '32': {'count': 42}, '21': {'count': 42}, '150': {'count': 43}, '3': {'count': 42}, '80': {'count': 45}, '106': {'count': 44}, '190': {'count': 46}, '169': {'count': 44}, '194': {'count': 43}, '90': {'count': 38}, '4': {'count': 40}, '163': {'count': 43}, '147': {'count': 45}, '187': {'count': 43}, '43': {'count': 44}, '6': {'count': 39}, '30': {'count': 44}, '73': {'count': 
43}, '29': {'count': 41}, '165': {'count': 41}, '179': {'count': 42}, '105': {'count': 41}, '2': {'count': 43}, '64': {'count': 45}, '34': {'count': 41}, '74': {'count': 44}, '84': {'count': 43}, '24': {'count': 39}, '167': {'count': 42}, '136': {'count': 43}, '133': {'count': 33}, '155': {'count': 39}, '119': {'count': 42}, '129': {'count': 41}, '127': {'count': 39}, '35': {'count': 41}, '170': {'count': 46}, '36': {'count': 38}, '63': {'count': 29}, '182': {'count': 42}, '42': {'count': 46}, '17': {'count': 42}, '75': {'count': 43}, '0': {'count': 44}, '62': {'count': 44}, '173': {'count': 41}, '16': {'count': 40}, '104': {'count': 43}, '49': {'count': 42}, '122': {'count': 44}, '81': {'count': 45}, '191': {'count': 42}, '92': {'count': 39}, '145': {'count': 43}, '95': {'count': 41}, '54': {'count': 39}, '114': {'count': 45}, '112': {'count': 42}, '151': {'count': 35}, '91': {'count': 40}, '188': {'count': 40}, '20': {'count': 42}, '33': {'count': 44}, '86': {'count': 44}, '128': {'count': 38}, '142': {'count': 40}, '19': {'count': 46}, '177': {'count': 41}, '11': {'count': 36}, '45': {'count': 43}, '60': {'count': 43}, '8': {'count': 41}, '56': {'count': 37}, '28': {'count': 42}, '120': {'count': 44}, '5': {'count': 44}, '85': {'count': 42}, '68': {'count': 38}, '22': {'count': 39}, '108': {'count': 44}, '89': {'count': 41}, '132': {'count': 42}, '125': {'count': 42}, '137': {'count': 39}, '158': {'count': 36}, '58': {'count': 44}, '123': {'count': 39}, '52': {'count': 44}, '27': {'count': 41}, '13': {'count': 42}, '70': {'count': 35}, '25': {'count': 34}, '185': {'count': 38}, '171': {'count': 44}, '9': {'count': 33}, '40': {'count': 35}, '178': {'count': 45}, '44': {'count': 32}, '97': {'count': 46}, '87': {'count': 39}, '159': {'count': 44}, '146': {'count': 44}, '51': {'count': 41}, '121': {'count': 40}, '1': {'count': 32}, '160': {'count': 48}, '78': {'count': 48}, '109': {'count': 43}, '103': {'count': 42}, '174': {'count': 30}, '181': {'count': 46}, '23': 
{'count': 45}, '111': {'count': 45}, '166': {'count': 47}, '172': {'count': 43}, '66': {'count': 38}, '192': {'count': 41}, '148': {'count': 42}, '72': {'count': 44}, '141': {'count': 32}, '71': {'count': 45}, '7': {'count': 45}, '152': {'count': 44}, '183': {'count': 40}, '98': {'count': 27}, '94': {'count': 45}, '126': {'count': 41}, '100': {'count': 42}, '131': {'count': 43}, '116': {'count': 42}, '39': {'count': 39}, '149': {'count': 36}, '101': {'count': 39}, '139': {'count': 42}, '69': {'count': 42}, '12': {'count': 41}, '14': {'count': 43}, '96': {'count': 42}, '41': {'count': 34}, '189': {'count': 43}, '10': {'count': 38}, '140': {'count': 34}, '26': {'count': 35}, '57': {'count': 44}, '88': {'count': 44}, '67': {'count': 40}, '93': {'count': 43}, '193': {'count': 45}, '161': {'count': 45}, '118': {'count': 68}, '110': {'count': 42}, '154': {'count': 42}, '138': {'count': 42}, '143': {'count': 46}, '61': {'count': 37}, '176': {'count': 44}, '113': {'count': 45}, '18': {'count': 40}, '53': {'count': 40}, '47': {'count': 42}, '157': {'count': 29}, '168': {'count': 38}, '124': {'count': 43}, '79': {'count': 43}, '130': {'count': 42}, '46': {'count': 35}, '55': {'count': 46}, '195': {'count': 40}, '38': {'count': 36}, '37': {'count': 40}, '99': {'count': 33}, '83': {'count': 42}, '162': {'count': 36}, '135': {'count': 24}, '175': {'count': 38}, '156': {'count': 36}, '15': {'count': 43}, '65': {'count': 41}, '76': {'count': 40}}}} | | [StanfordCarsI2IRetrieval](https://pure.mpg.de/rest/items/item_2029263/component/file_2029262/content) (Jonathan Krause, 2013) | ['eng'] | Any2AnyRetrieval | i2i | [Encyclopaedic] | {'test': 16082} | {'test': {'number_of_characters': 0, 'num_samples': 16082, 'num_queries': 8041, 'num_documents': 8041, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 8041, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 
'num_query_images': 8041, 'min_relevant_docs_per_query': 23, 'average_relevant_docs_per_query': 40.49, 'max_relevant_docs_per_query': 67, 'unique_relevant_docs': 8041}} | | [StanfordCarsZeroShot](https://pure.mpg.de/rest/items/item_2029263/component/file_2029262/content) (Jonathan Krause, 2013) | ['eng'] | ZeroShotClassification | i2t | [Scene] | {'test': 8041} | {'test': {'num_samples': 8041, 'unique_num_labels': 196, 'min_image_width': 78, 'average_image_width': 701.18, 'max_image_width': 7800, 'min_image_height': 41, 'average_image_height': 483.75, 'max_image_height': 5400, 'min_label_text_length': 29, 'average_label_text_length': 40.83, 'max_label_text_length': 68, 'labels': {'180': {'count': 38}, '102': {'count': 39}, '144': {'count': 44}, '186': {'count': 43}, '184': {'count': 38}, '77': {'count': 37}, '117': {'count': 41}, '164': {'count': 44}, '31': {'count': 41}, '59': {'count': 36}, '48': {'count': 37}, '107': {'count': 44}, '115': {'count': 37}, '134': {'count': 42}, '82': {'count': 40}, '50': {'count': 43}, '153': {'count': 42}, '32': {'count': 42}, '21': {'count': 42}, '150': {'count': 43}, '3': {'count': 42}, '80': {'count': 45}, '106': {'count': 44}, '190': {'count': 46}, '169': {'count': 44}, '194': {'count': 43}, '90': {'count': 38}, '4': {'count': 40}, '163': {'count': 43}, '147': {'count': 45}, '187': {'count': 43}, '43': {'count': 44}, '6': {'count': 39}, '30': {'count': 44}, '73': {'count': 43}, '29': {'count': 41}, '165': {'count': 41}, '179': {'count': 42}, '105': {'count': 41}, '2': {'count': 43}, '64': {'count': 45}, '34': {'count': 41}, '74': {'count': 44}, '84': {'count': 43}, '24': {'count': 39}, '167': {'count': 42}, '136': {'count': 43}, '133': {'count': 33}, '155': {'count': 39}, '119': {'count': 42}, '129': {'count': 41}, '127': {'count': 39}, '35': {'count': 41}, '170': {'count': 46}, '36': {'count': 38}, '63': {'count': 29}, '182': {'count': 42}, '42': {'count': 46}, '17': {'count': 42}, '75': {'count': 43}, '0': {'count': 44}, 
'62': {'count': 44}, '173': {'count': 41}, '16': {'count': 40}, '104': {'count': 43}, '49': {'count': 42}, '122': {'count': 44}, '81': {'count': 45}, '191': {'count': 42}, '92': {'count': 39}, '145': {'count': 43}, '95': {'count': 41}, '54': {'count': 39}, '114': {'count': 45}, '112': {'count': 42}, '151': {'count': 35}, '91': {'count': 40}, '188': {'count': 40}, '20': {'count': 42}, '33': {'count': 44}, '86': {'count': 44}, '128': {'count': 38}, '142': {'count': 40}, '19': {'count': 46}, '177': {'count': 41}, '11': {'count': 36}, '45': {'count': 43}, '60': {'count': 43}, '8': {'count': 41}, '56': {'count': 37}, '28': {'count': 42}, '120': {'count': 44}, '5': {'count': 44}, '85': {'count': 42}, '68': {'count': 38}, '22': {'count': 39}, '108': {'count': 44}, '89': {'count': 41}, '132': {'count': 42}, '125': {'count': 42}, '137': {'count': 39}, '158': {'count': 36}, '58': {'count': 44}, '123': {'count': 39}, '52': {'count': 44}, '27': {'count': 41}, '13': {'count': 42}, '70': {'count': 35}, '25': {'count': 34}, '185': {'count': 38}, '171': {'count': 44}, '9': {'count': 33}, '40': {'count': 35}, '178': {'count': 45}, '44': {'count': 32}, '97': {'count': 46}, '87': {'count': 39}, '159': {'count': 44}, '146': {'count': 44}, '51': {'count': 41}, '121': {'count': 40}, '1': {'count': 32}, '160': {'count': 48}, '78': {'count': 48}, '109': {'count': 43}, '103': {'count': 42}, '174': {'count': 30}, '181': {'count': 46}, '23': {'count': 45}, '111': {'count': 45}, '166': {'count': 47}, '172': {'count': 43}, '66': {'count': 38}, '192': {'count': 41}, '148': {'count': 42}, '72': {'count': 44}, '141': {'count': 32}, '71': {'count': 45}, '7': {'count': 45}, '152': {'count': 44}, '183': {'count': 40}, '98': {'count': 27}, '94': {'count': 45}, '126': {'count': 41}, '100': {'count': 42}, '131': {'count': 43}, '116': {'count': 42}, '39': {'count': 39}, '149': {'count': 36}, '101': {'count': 39}, '139': {'count': 42}, '69': {'count': 42}, '12': {'count': 41}, '14': {'count': 43}, '96': 
{'count': 42}, '41': {'count': 34}, '189': {'count': 43}, '10': {'count': 38}, '140': {'count': 34}, '26': {'count': 35}, '57': {'count': 44}, '88': {'count': 44}, '67': {'count': 40}, '93': {'count': 43}, '193': {'count': 45}, '161': {'count': 45}, '118': {'count': 68}, '110': {'count': 42}, '154': {'count': 42}, '138': {'count': 42}, '143': {'count': 46}, '61': {'count': 37}, '176': {'count': 44}, '113': {'count': 45}, '18': {'count': 40}, '53': {'count': 40}, '47': {'count': 42}, '157': {'count': 29}, '168': {'count': 38}, '124': {'count': 43}, '79': {'count': 43}, '130': {'count': 42}, '46': {'count': 35}, '55': {'count': 46}, '195': {'count': 40}, '38': {'count': 36}, '37': {'count': 40}, '99': {'count': 33}, '83': {'count': 42}, '162': {'count': 36}, '135': {'count': 24}, '175': {'count': 38}, '156': {'count': 36}, '15': {'count': 43}, '65': {'count': 41}, '76': {'count': 40}}}} | -| [StatcanDialogueDatasetRetrieval](https://mcgill-nlp.github.io/statcan-dialogue-dataset/) | ['eng', 'fra'] | Retrieval | s2p | [Government, Web, Written] | None | None | +| [StatcanDialogueDatasetRetrieval](https://mcgill-nlp.github.io/statcan-dialogue-dataset/) (Lu et al., 2023) | ['eng', 'fra'] | Retrieval | s2p | [Government, Web, Written] | None | None | | [SugarCrepe](https://proceedings.neurips.cc/paper_files/paper/2023/hash/63461de0b4cb760fc498e85b18a7fe81-Abstract-Datasets_and_Benchmarks.html) (Hsieh et al., 2024) | ['eng'] | Compositionality | i2t | [Encyclopaedic] | {'test': 7511} | {'test': {'num_samples': 7511, 'num_images': 7511, 'num_texts': 15022, 'num_unique_texts': 11844, 'min_text_length': 24, 'average_text_length': 56.49, 'max_text_length': 210}} | | [SummEvalFrSummarization.v2](https://github.com/Yale-LILY/SummEval) (Fabbri et al., 2020) | ['fra'] | Summarization | p2p | [News, Written] | None | None | | [SummEvalSummarization.v2](https://github.com/Yale-LILY/SummEval) (Fabbri et al., 2020) | ['eng'] | Summarization | p2p | [News, Written] | None | None | -| 
[SwahiliNewsClassification](https://huggingface.co/datasets/Mollel/SwahiliNewsClassification) | ['swa'] | Classification | s2s | [News, Written] | None | None | -| [SweFaqRetrieval](https://spraakbanken.gu.se/en/resources/superlim) (Berdi{ {c, 2023) | ['swe'] | Retrieval | s2s | [Government, Non-fiction, Written] | None | None | -| [SweRecClassification](https://aclanthology.org/2023.nodalida-1.20/) | ['swe'] | Classification | s2s | [Reviews, Written] | None | None | +| [SwahiliNewsClassification](https://huggingface.co/datasets/Mollel/SwahiliNewsClassification) (Davis et al., 2020) | ['swa'] | Classification | s2s | [News, Written] | None | None | +| [SweFaqRetrieval](https://spraakbanken.gu.se/en/resources/superlim) (Berdi{\v{c, 2023) | ['swe'] | Retrieval | s2s | [Government, Non-fiction, Written] | None | None | +| [SweRecClassification](https://aclanthology.org/2023.nodalida-1.20/) (Nielsen et al., 2023) | ['swe'] | Classification | s2s | [Reviews, Written] | None | None | | [SwedishSentimentClassification](https://huggingface.co/datasets/swedish_reviews) | ['swe'] | Classification | s2s | [Reviews, Written] | None | None | | [SwednClusteringP2P](https://spraakbanken.gu.se/en/resources/swedn) (Monsen et al., 2021) | ['swe'] | Clustering | p2p | [News, Non-fiction, Written] | None | None | | [SwednClusteringS2S](https://spraakbanken.gu.se/en/resources/swedn) (Monsen et al., 2021) | ['swe'] | Clustering | s2s | [News, Non-fiction, Written] | None | None | @@ -763,28 +790,29 @@ The following tables give you an overview of the tasks in MTEB. 
| [SyntecRetrieval](https://huggingface.co/datasets/lyon-nlp/mteb-fr-retrieval-syntec-s2p) (Mathieu Ciancone, 2024) | ['fra'] | Retrieval | s2p | [Legal, Written] | None | None | | [SyntheticText2SQL](https://huggingface.co/datasets/gretelai/synthetic_text_to_sql) (Meyer et al., 2024) | ['eng', 'sql'] | Retrieval | p2p | [Programming, Written] | {'test': 111702} | {'test': {'number_of_characters': 14041553, 'num_samples': 111702, 'num_queries': 5851, 'num_documents': 105851, 'min_document_length': 13, 'average_document_length': 4.58, 'max_document_length': 281, 'unique_documents': 105851, 'min_query_length': 17, 'average_query_length': 2316.95, 'max_query_length': 762, 'unique_queries': 5851, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 5851}} | | [T2Reranking](https://arxiv.org/abs/2304.03679) (Xiaohui Xie, 2023) | ['cmn'] | Reranking | s2s | | None | None | -| [T2Retrieval](https://arxiv.org/abs/2304.03679) (Xiaohui Xie, 2023) | ['cmn'] | Retrieval | s2p | | None | None | +| [T2Retrieval](https://arxiv.org/abs/2304.03679) (Xiaohui Xie, 2023) | ['cmn'] | Retrieval | s2p | [Academic, Financial, Government, Medical, Non-fiction] | None | None | | [TERRa](https://arxiv.org/pdf/2010.15925) (Shavrina et al., 2020) | ['rus'] | PairClassification | s2s | [News, Web, Written] | None | None | -| [TNews](https://www.cluebenchmarks.com/introduce.html) | ['cmn'] | Classification | s2s | | None | None | +| [TNews](https://www.cluebenchmarks.com/introduce.html) (Xu et al., 2020) | ['cmn'] | Classification | s2s | | None | None | | [TRECCOVID](https://ir.nist.gov/covidSubmit/index.html) (Kirk Roberts, 2021) | ['eng'] | Retrieval | s2p | [Academic, Medical, Written] | None | None | | [TRECCOVID-Fa](https://huggingface.co/datasets/MCINext/trec-covid-fa) | ['fas'] | Retrieval | s2p | [Medical] | None | None | | 
[TRECCOVID-NL](https://colab.research.google.com/drive/1R99rjeAGt8S9IfAIRR3wS052sNu3Bjo-#scrollTo=4HduGW6xHnrZ) (Nikolay Banar, 2024) | ['nld'] | Retrieval | s2p | [Academic, Medical, Written] | None | None | | [TRECCOVID-PL](https://ir.nist.gov/covidSubmit/index.html) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | [Academic, Medical, Non-fiction, Written] | None | None | | [TUBerlinT2IRetrieval](https://dl.acm.org/doi/pdf/10.1145/2185520.2185540?casa_token=tq-eUx5UROYAAAAA:_694nPzE7tali6LCkxQc0M-mlo9xslasPMcVnFPMy9tDfvt7lg7p1RTe-k8VWCjuv9gmkQqasKUZ) (Eitz et al., 2012) | ['eng'] | Any2AnyRetrieval | t2i | [Encyclopaedic] | {'test': 20250} | {'test': {'number_of_characters': 1810, 'num_samples': 20250, 'num_queries': 250, 'num_documents': 20000, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 20000, 'min_query_length': 2, 'average_query_length': 7.24, 'max_query_length': 18, 'unique_queries': 250, 'num_query_images': 0, 'min_relevant_docs_per_query': 80, 'average_relevant_docs_per_query': 80.0, 'max_relevant_docs_per_query': 80, 'unique_relevant_docs': 20000}} | -| [TV2Nordretrieval](https://huggingface.co/datasets/alexandrainst/nordjylland-news-summarization) | ['dan'] | Retrieval | p2p | [News, Non-fiction, Written] | None | None | +| [TV2Nordretrieval](https://huggingface.co/datasets/alexandrainst/nordjylland-news-summarization) (Flansmose Mikkelsen et al., 2022) | ['dan'] | Retrieval | p2p | [News, Non-fiction, Written] | None | None | +| [TalemaaderPC](https://sprogteknologi.dk/dataset/1000-talemader-evalueringsdatasaet) ({Det Danske Sprog- og Litteraturselskab, 2024) | ['dan'] | PairClassification | s2s | [Academic, Written] | None | None | | [TamilNewsClassification](https://github.com/vanangamudi/tamil-news-classification) (Anoop Kunchukuttan, 2020) | ['tam'] | Classification | s2s | [News, Written] | None | None | | 
[Tatoeba](https://github.com/facebookresearch/LASER/tree/main/data/tatoeba/v1) (Tatoeba community, 2021) | ['afr', 'amh', 'ang', 'ara', 'arq', 'arz', 'ast', 'awa', 'aze', 'bel', 'ben', 'ber', 'bos', 'bre', 'bul', 'cat', 'cbk', 'ceb', 'ces', 'cha', 'cmn', 'cor', 'csb', 'cym', 'dan', 'deu', 'dsb', 'dtp', 'ell', 'eng', 'epo', 'est', 'eus', 'fao', 'fin', 'fra', 'fry', 'gla', 'gle', 'glg', 'gsw', 'heb', 'hin', 'hrv', 'hsb', 'hun', 'hye', 'ido', 'ile', 'ina', 'ind', 'isl', 'ita', 'jav', 'jpn', 'kab', 'kat', 'kaz', 'khm', 'kor', 'kur', 'kzj', 'lat', 'lfn', 'lit', 'lvs', 'mal', 'mar', 'max', 'mhr', 'mkd', 'mon', 'nds', 'nld', 'nno', 'nob', 'nov', 'oci', 'orv', 'pam', 'pes', 'pms', 'pol', 'por', 'ron', 'rus', 'slk', 'slv', 'spa', 'sqi', 'srp', 'swe', 'swg', 'swh', 'tam', 'tat', 'tel', 'tgl', 'tha', 'tuk', 'tur', 'tzl', 'uig', 'ukr', 'urd', 'uzb', 'vie', 'war', 'wuu', 'xho', 'yid', 'yue', 'zsm'] | BitextMining | s2s | [Written] | None | None | | [TbilisiCityHallBitextMining](https://huggingface.co/datasets/jupyterjazz/tbilisi-city-hall-titles) | ['eng', 'kat'] | BitextMining | s2s | [News, Written] | {'test': 3640} | {'test': {'num_samples': 3640, 'number_of_characters': 572146, 'unique_pairs': 3640, 'min_sentence1_length': 13, 'average_sentence1_length': 78.59, 'max_sentence1_length': 203, 'unique_sentence1': 3636, 'min_sentence2_length': 13, 'average_sentence2_length': 78.59, 'max_sentence2_length': 203, 'unique_sentence2': 3636, 'hf_subset_descriptive_stats': {'kat_Geor-eng_Latn': {'num_samples': 1820, 'number_of_characters': 286073, 'unique_pairs': 1820, 'min_sentence1_length': 30, 'average_sentence1_length': 76.07, 'max_sentence1_length': 189, 'unique_sentence1': 1820, 'min_sentence2_length': 13, 'average_sentence2_length': 81.12, 'max_sentence2_length': 203, 'unique_sentence2': 1816}, 'eng_Latn-kat_Geor': {'num_samples': 1820, 'number_of_characters': 286073, 'unique_pairs': 1820, 'min_sentence1_length': 13, 'average_sentence1_length': 81.12, 'max_sentence1_length': 
203, 'unique_sentence1': 1816, 'min_sentence2_length': 30, 'average_sentence2_length': 76.07, 'max_sentence2_length': 189, 'unique_sentence2': 1820}}}} | | [TelemarketingSalesRuleLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [TeluguAndhraJyotiNewsClassification](https://github.com/AnushaMotamarri/Telugu-Newspaper-Article-Dataset) | ['tel'] | Classification | s2s | [News, Written] | None | None | -| [TempReasonL1](https://github.com/DAMO-NLP-SG/TempReason) (Xiao et al., 2024) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | -| [TempReasonL2Context](https://github.com/DAMO-NLP-SG/TempReason) (Xiao et al., 2024) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | -| [TempReasonL2Fact](https://github.com/DAMO-NLP-SG/TempReason) (Xiao et al., 2024) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | -| [TempReasonL2Pure](https://github.com/DAMO-NLP-SG/TempReason) (Xiao et al., 2024) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | -| [TempReasonL3Context](https://github.com/DAMO-NLP-SG/TempReason) (Xiao et al., 2024) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | -| [TempReasonL3Fact](https://github.com/DAMO-NLP-SG/TempReason) (Xiao et al., 2024) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | -| [TempReasonL3Pure](https://github.com/DAMO-NLP-SG/TempReason) (Xiao et al., 2024) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | -| [TenKGnadClassification](https://tblock.github.io/10kGNAD/) | ['deu'] | Classification | p2p | [News, Written] | None | None | +| [TempReasonL1](https://github.com/DAMO-NLP-SG/TempReason) (Tan et al., 2023) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | +| [TempReasonL2Context](https://github.com/DAMO-NLP-SG/TempReason) (Tan et al., 2023) | ['eng'] | 
Retrieval | s2s | [Encyclopaedic, Written] | None | None | +| [TempReasonL2Fact](https://github.com/DAMO-NLP-SG/TempReason) (Tan et al., 2023) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | +| [TempReasonL2Pure](https://github.com/DAMO-NLP-SG/TempReason) (Tan et al., 2023) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | +| [TempReasonL3Context](https://github.com/DAMO-NLP-SG/TempReason) (Tan et al., 2023) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | +| [TempReasonL3Fact](https://github.com/DAMO-NLP-SG/TempReason) (Tan et al., 2023) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | +| [TempReasonL3Pure](https://github.com/DAMO-NLP-SG/TempReason) (Tan et al., 2023) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | +| [TenKGnadClassification](https://tblock.github.io/10kGNAD/) (Dietmar Schabus, 2017) | ['deu'] | Classification | p2p | [News, Written] | None | None | | [TenKGnadClusteringP2P.v2](https://tblock.github.io/10kGNAD/) | ['deu'] | Clustering | p2p | [News, Non-fiction, Written] | None | None | | [TenKGnadClusteringS2S.v2](https://tblock.github.io/10kGNAD/) | ['deu'] | Clustering | s2s | [News, Non-fiction, Written] | None | None | | [TextualismToolDictionariesLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | @@ -796,7 +824,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [TopiOCQAHardNegatives](https://mcgill-nlp.github.io/topiocqa) (Vaibhav Adlakha, 2022) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [Touche2020-Fa](https://huggingface.co/datasets/MCINext/touche2020-fa) | ['fas'] | Retrieval | s2p | [Spoken] | None | None | | [Touche2020-NL](https://huggingface.co/datasets/clips/beir-nl-webis-touche2020) (Nikolay Banar, 2024) | ['nld'] | Retrieval | s2p | [Academic, Non-fiction] | None | None | -| [Touche2020Retrieval.v3](https://github.com/castorini/touche-error-analysis) | ['eng'] | Retrieval | s2p | [Academic] | {'test': 303781} | {'test': {'number_of_characters': 637047138, 'num_samples': 303781, 'num_queries': 49, 'num_documents': 303732, 'min_document_length': 16, 'average_document_length': 0.01, 'max_document_length': 83, 'unique_documents': 303732, 'min_query_length': 41, 'average_query_length': 13000918.57, 'max_query_length': 105983, 'unique_queries': 49, 'min_relevant_docs_per_query': 40, 'average_relevant_docs_per_query': 58.14, 'max_relevant_docs_per_query': 87, 'unique_relevant_docs': 2732}} | +| [Touche2020Retrieval.v3](https://github.com/castorini/touche-error-analysis) (Nandan Thakur, 2024) | ['eng'] | Retrieval | s2p | [Academic] | {'test': 303781} | {'test': {'number_of_characters': 637047138, 'num_samples': 303781, 'num_queries': 49, 'num_documents': 303732, 'min_document_length': 16, 'average_document_length': 0.01, 'max_document_length': 83, 'unique_documents': 303732, 'min_query_length': 41, 'average_query_length': 13000918.57, 'max_query_length': 105983, 'unique_queries': 49, 'min_relevant_docs_per_query': 40, 'average_relevant_docs_per_query': 58.14, 'max_relevant_docs_per_query': 87, 'unique_relevant_docs': 2732}} | | [ToxicChatClassification](https://aclanthology.org/2023.findings-emnlp.311/) (Zi Lin, 2023) | ['eng'] | Classification | s2s | [Constructed, Written] | None | None | | 
[ToxicConversationsClassification](https://www.kaggle.com/competitions/jigsaw-unintended-bias-in-toxicity-classification/overview) (cjadams, 2019) | ['eng'] | Classification | s2s | [Social, Written] | None | None | | [TswanaNewsClassification](https://link.springer.com/chapter/10.1007/978-3-031-49002-6_17) (Vukosi Marivate, 2023) | ['tsn'] | Classification | s2s | [News, Written] | None | None | @@ -805,26 +833,28 @@ The following tables give you an overview of the tasks in MTEB. | [TurkishMovieSentimentClassification](https://www.win.tue.nl/~mpechen/publications/pubs/MT_WISDOM2013.pdf) (Erkin Demirtas, 2013) | ['tur'] | Classification | s2s | [Reviews, Written] | None | None | | [TurkishProductSentimentClassification](https://www.win.tue.nl/~mpechen/publications/pubs/MT_WISDOM2013.pdf) (Erkin Demirtas, 2013) | ['tur'] | Classification | s2s | [Reviews, Written] | None | None | | [TweetEmotionClassification](https://link.springer.com/chapter/10.1007/978-3-319-77116-8_8) (Al-Khatib et al., 2018) | ['ara'] | Classification | s2s | [Social, Written] | None | None | -| [TweetSarcasmClassification](https://aclanthology.org/2020.osact-1.5/) | ['ara'] | Classification | s2s | [Social, Written] | None | None | -| [TweetSentimentClassification](https://aclanthology.org/2022.lrec-1.27) | ['ara', 'deu', 'eng', 'fra', 'hin', 'ita', 'por', 'spa'] | Classification | s2s | [Social, Written] | None | None | +| [TweetSarcasmClassification](https://aclanthology.org/2020.osact-1.5/) (Abu Farha et al., 2020) | ['ara'] | Classification | s2s | [Social, Written] | None | None | +| [TweetSentimentClassification](https://aclanthology.org/2022.lrec-1.27) (Barbieri et al., 2022) | ['ara', 'deu', 'eng', 'fra', 'hin', 'ita', 'por', 'spa'] | Classification | s2s | [Social, Written] | None | None | | [TweetSentimentExtractionClassification](https://www.kaggle.com/competitions/tweet-sentiment-extraction/overview) (Maggie et al., 2020) | ['eng'] | Classification | s2s | [Social, Written] | None 
| None | -| [TweetTopicSingleClassification](https://arxiv.org/abs/2209.09824) | ['eng'] | Classification | s2s | [News, Social, Written] | None | None | +| [TweetTopicSingleClassification](https://arxiv.org/abs/2209.09824) (Antypas et al., 2022) | ['eng'] | Classification | s2s | [News, Social, Written] | None | None | | [TwentyNewsgroupsClustering.v2](https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html) (Ken Lang, 1995) | ['eng'] | Clustering | s2s | [News, Written] | {'test': 59545} | {'test': {'num_samples': 59545, 'number_of_characters': 1907719, 'min_text_length': 11, 'average_text_length': 32.04, 'max_text_length': 120, 'min_labels_per_text': 2082, 'average_labels_per_text': 1.0, 'max_labels_per_text': 3236, 'unique_labels': 20, 'labels': {'12': {'count': 3137}, '6': {'count': 3070}, '0': {'count': 2613}, '2': {'count': 3155}, '10': {'count': 3220}, '17': {'count': 2986}, '14': {'count': 3106}, '13': {'count': 3055}, '1': {'count': 3056}, '16': {'count': 2911}, '9': {'count': 2984}, '3': {'count': 3070}, '15': {'count': 3090}, '7': {'count': 3036}, '5': {'count': 3124}, '11': {'count': 3236}, '18': {'count': 2483}, '8': {'count': 3090}, '19': {'count': 2082}, '4': {'count': 3041}}}} | | [TwitterHjerneRetrieval](https://huggingface.co/datasets/sorenmulli/da-hashtag-twitterhjerne) (Holm et al., 2024) | ['dan'] | Retrieval | p2p | [Social, Written] | None | None | -| [TwitterSemEval2015](https://alt.qcri.org/semeval2015/task1/) | ['eng'] | PairClassification | s2s | [Social, Written] | None | None | -| [TwitterURLCorpus](https://languagenet.github.io/) | ['eng'] | PairClassification | s2s | [Social, Written] | {'test': 51534} | {'test': {'num_samples': 51534, 'number_of_characters': 8659940, 'min_sentence1_length': 24, 'avg_sentence1_length': 79.49, 'max_sentence1_length': 126, 'unique_sentence1': 4329, 'min_sentence2_length': 6, 'avg_sentence2_length': 88.55, 'max_sentence2_length': 608, 'unique_sentence2': 41304, 'unique_labels': 2, 'labels': {'0': 
{'count': 38546}, '1': {'count': 12988}}}} | +| [TwitterSemEval2015](https://alt.qcri.org/semeval2015/task1/) (Xu et al., 2015) | ['eng'] | PairClassification | s2s | [Social, Written] | None | None | +| [TwitterURLCorpus](https://languagenet.github.io/) (Lan et al., 2017) | ['eng'] | PairClassification | s2s | [Social, Written] | {'test': 51534} | {'test': {'num_samples': 51534, 'number_of_characters': 8659940, 'min_sentence1_length': 24, 'avg_sentence1_length': 79.49, 'max_sentence1_length': 126, 'unique_sentence1': 4329, 'min_sentence2_length': 6, 'avg_sentence2_length': 88.55, 'max_sentence2_length': 608, 'unique_sentence2': 41304, 'unique_labels': 2, 'labels': {'0': {'count': 38546}, '1': {'count': 12988}}}} | | [UCCVCommonLawLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [UCF101](https://huggingface.co/datasets/flwrlabs/ucf101) (Khurram Soomro, 2012) | ['eng'] | ImageClassification | i2i | [Scene] | {'test': 697222} | {'test': {'num_samples': 697222, 'unique_num_labels': 101, 'min_image_width': 320, 'average_image_width': 320.12, 'max_image_width': 400, 'min_image_height': 226, 'average_image_height': 239.98, 'max_image_height': 240, 'labels': {'0': {'count': 7475}, '1': {'count': 6341}, '2': {'count': 6181}, '3': {'count': 6320}, '4': {'count': 3708}, '5': {'count': 7296}, '6': {'count': 4004}, '7': {'count': 3923}, '8': {'count': 2267}, '9': {'count': 5587}, '10': {'count': 8946}, '11': {'count': 12714}, '12': {'count': 6053}, '13': {'count': 3191}, '14': {'count': 3696}, '15': {'count': 5468}, '16': {'count': 10032}, '17': {'count': 8346}, '18': {'count': 5098}, '19': {'count': 10811}, '20': {'count': 6378}, '21': {'count': 3385}, '22': {'count': 3974}, '23': {'count': 4781}, '24': {'count': 5867}, '25': {'count': 7904}, '26': {'count': 12181}, '27': {'count': 4511}, '28': {'count': 4402}, '29': {'count': 5513}, '30': {'count': 3236}, '31': 
{'count': 7160}, '32': {'count': 6455}, '33': {'count': 3766}, '34': {'count': 8362}, '35': {'count': 3521}, '36': {'count': 3263}, '37': {'count': 5112}, '38': {'count': 9685}, '39': {'count': 4598}, '40': {'count': 6682}, '41': {'count': 8690}, '42': {'count': 3591}, '43': {'count': 11432}, '44': {'count': 3458}, '45': {'count': 10080}, '46': {'count': 16507}, '47': {'count': 3001}, '48': {'count': 6524}, '49': {'count': 7786}, '50': {'count': 4657}, '51': {'count': 8795}, '52': {'count': 3992}, '53': {'count': 5668}, '54': {'count': 6575}, '55': {'count': 8662}, '56': {'count': 5253}, '57': {'count': 3761}, '58': {'count': 8679}, '59': {'count': 11986}, '60': {'count': 15720}, '61': {'count': 12080}, '62': {'count': 10634}, '63': {'count': 6161}, '64': {'count': 13934}, '65': {'count': 8393}, '66': {'count': 5452}, '67': {'count': 7905}, '68': {'count': 12354}, '69': {'count': 4060}, '70': {'count': 9075}, '71': {'count': 2689}, '72': {'count': 5435}, '73': {'count': 17655}, '74': {'count': 5693}, '75': {'count': 12572}, '76': {'count': 9543}, '77': {'count': 10793}, '78': {'count': 4134}, '79': {'count': 4832}, '80': {'count': 8977}, '81': {'count': 7381}, '82': {'count': 4927}, '83': {'count': 12469}, '84': {'count': 3843}, '85': {'count': 4945}, '86': {'count': 6724}, '87': {'count': 6582}, '88': {'count': 7046}, '89': {'count': 5874}, '90': {'count': 4878}, '91': {'count': 6417}, '92': {'count': 3762}, '93': {'count': 7349}, '94': {'count': 8149}, '95': {'count': 3925}, '96': {'count': 3378}, '97': {'count': 7721}, '98': {'count': 3671}, '99': {'count': 6292}, '100': {'count': 6508}}}} | | [UCF101ZeroShot](https://huggingface.co/datasets/flwrlabs/ucf101) (Khurram Soomro, 2012) | ['eng'] | ZeroShotClassification | i2t | [Scene] | {'test': 697222} | {'test': {'num_samples': 697222, 'unique_num_labels': 101, 'min_image_width': 320, 'average_image_width': 320.12, 'max_image_width': 400, 'min_image_height': 226, 'average_image_height': 239.98, 'max_image_height': 
240, 'min_label_text_length': 15, 'average_label_text_length': 21.77, 'max_label_text_length': 29, 'labels': {'0': {'count': 7475}, '1': {'count': 6341}, '2': {'count': 6181}, '3': {'count': 6320}, '4': {'count': 3708}, '5': {'count': 7296}, '6': {'count': 4004}, '7': {'count': 3923}, '8': {'count': 2267}, '9': {'count': 5587}, '10': {'count': 8946}, '11': {'count': 12714}, '12': {'count': 6053}, '13': {'count': 3191}, '14': {'count': 3696}, '15': {'count': 5468}, '16': {'count': 10032}, '17': {'count': 8346}, '18': {'count': 5098}, '19': {'count': 10811}, '20': {'count': 6378}, '21': {'count': 3385}, '22': {'count': 3974}, '23': {'count': 4781}, '24': {'count': 5867}, '25': {'count': 7904}, '26': {'count': 12181}, '27': {'count': 4511}, '28': {'count': 4402}, '29': {'count': 5513}, '30': {'count': 3236}, '31': {'count': 7160}, '32': {'count': 6455}, '33': {'count': 3766}, '34': {'count': 8362}, '35': {'count': 3521}, '36': {'count': 3263}, '37': {'count': 5112}, '38': {'count': 9685}, '39': {'count': 4598}, '40': {'count': 6682}, '41': {'count': 8690}, '42': {'count': 3591}, '43': {'count': 11432}, '44': {'count': 3458}, '45': {'count': 10080}, '46': {'count': 16507}, '47': {'count': 3001}, '48': {'count': 6524}, '49': {'count': 7786}, '50': {'count': 4657}, '51': {'count': 8795}, '52': {'count': 3992}, '53': {'count': 5668}, '54': {'count': 6575}, '55': {'count': 8662}, '56': {'count': 5253}, '57': {'count': 3761}, '58': {'count': 8679}, '59': {'count': 11986}, '60': {'count': 15720}, '61': {'count': 12080}, '62': {'count': 10634}, '63': {'count': 6161}, '64': {'count': 13934}, '65': {'count': 8393}, '66': {'count': 5452}, '67': {'count': 7905}, '68': {'count': 12354}, '69': {'count': 4060}, '70': {'count': 9075}, '71': {'count': 2689}, '72': {'count': 5435}, '73': {'count': 17655}, '74': {'count': 5693}, '75': {'count': 12572}, '76': {'count': 9543}, '77': {'count': 10793}, '78': {'count': 4134}, '79': {'count': 4832}, '80': {'count': 8977}, '81': {'count': 
7381}, '82': {'count': 4927}, '83': {'count': 12469}, '84': {'count': 3843}, '85': {'count': 4945}, '86': {'count': 6724}, '87': {'count': 6582}, '88': {'count': 7046}, '89': {'count': 5874}, '90': {'count': 4878}, '91': {'count': 6417}, '92': {'count': 3762}, '93': {'count': 7349}, '94': {'count': 8149}, '95': {'count': 3925}, '96': {'count': 3378}, '97': {'count': 7721}, '98': {'count': 3671}, '99': {'count': 6292}, '100': {'count': 6508}}}} | -| [UkrFormalityClassification](https://huggingface.co/datasets/ukr-detect/ukr-formality-dataset-translated-gyafc) | ['ukr'] | Classification | s2s | [News, Written] | None | None | +| [UkrFormalityClassification](https://huggingface.co/datasets/ukr-detect/ukr-formality-dataset-translated-gyafc) (Rao et al., 2018) | ['ukr'] | Classification | s2s | [News, Written] | None | None | | [UnfairTOSLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [UrbanSound8k_Zeroshot](https://huggingface.co/datasets/danavery/urbansound8K) (Justin Salamon, 2014) | ['eng'] | AudioZeroshotClassification | a2t | [Spoken] | None | None | | [UrduRomanSentimentClassification](https://archive.ics.uci.edu/dataset/458/roman+urdu+data+set) (Sharf,Zareen, 2018) | ['urd'] | Classification | s2s | [Social, Written] | None | None | | [VDRMultilingualRetrieval](https://huggingface.co/datasets/llamaindex/vdr-multilingual-test) (LlamaIndex, 2025) | ['deu', 'eng', 'fra', 'ita', 'spa'] | Retrieval | it2it | [Web] | None | None | | [VGHierarchicalClusteringP2P](https://huggingface.co/datasets/navjordj/VG_summarization) (Navjord et al., 2023) | ['nob'] | Clustering | p2p | [News, Non-fiction, Written] | None | None | | [VGHierarchicalClusteringS2S](https://huggingface.co/datasets/navjordj/VG_summarization) (Navjord et al., 2023) | ['nob'] | Clustering | p2p | [News, Non-fiction, Written] | None | None | -| 
[VOC2007](http://host.robots.ox.ac.uk/pascal/VOC/) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | {'test': 4952} | {'test': {'num_samples': 4952, 'min_image_width': 148, 'average_image_width': 471.25, 'max_image_width': 500, 'min_image_height': 139, 'average_image_height': 381.54, 'max_image_height': 500, 'min_labels_per_sample': 1, 'average_label_per_sample': 1.42, 'max_labels_per_sample': 5, 'unique_num_labels': 20, 'labels': {'14': {'count': 2007}, '11': {'count': 418}, '18': {'count': 259}, '17': {'count': 223}, '8': {'count': 417}, '6': {'count': 721}, '15': {'count': 224}, '10': {'count': 190}, '12': {'count': 274}, '7': {'count': 322}, '9': {'count': 127}, '5': {'count': 174}, '1': {'count': 239}, '13': {'count': 222}, '2': {'count': 282}, '19': {'count': 229}, '16': {'count': 97}, '0': {'count': 204}, '3': {'count': 172}, '4': {'count': 212}}}} | +| [VOC2007](http://host.robots.ox.ac.uk/pascal/VOC/) (Everingham et al., 2010) | ['eng'] | ImageClassification | i2i | [Encyclopaedic] | {'test': 4952} | {'test': {'num_samples': 4952, 'min_image_width': 148, 'average_image_width': 471.25, 'max_image_width': 500, 'min_image_height': 139, 'average_image_height': 381.54, 'max_image_height': 500, 'min_labels_per_sample': 1, 'average_label_per_sample': 1.42, 'max_labels_per_sample': 5, 'unique_num_labels': 20, 'labels': {'14': {'count': 2007}, '11': {'count': 418}, '18': {'count': 259}, '17': {'count': 223}, '8': {'count': 417}, '6': {'count': 721}, '15': {'count': 224}, '10': {'count': 190}, '12': {'count': 274}, '7': {'count': 322}, '9': {'count': 127}, '5': {'count': 174}, '1': {'count': 239}, '13': {'count': 222}, '2': {'count': 282}, '19': {'count': 229}, '16': {'count': 97}, '0': {'count': 204}, '3': {'count': 172}, '4': {'count': 212}}}} | | [VQA2IT2TRetrieval](https://openaccess.thecvf.com/content_cvpr_2017/html/Goyal_Making_the_v_CVPR_2017_paper.html) (Goyal et al., 2017) | ['eng'] | Any2AnyRetrieval | it2t | [Web] | {'test': 235951} | {'test': 
{'number_of_characters': 6850685, 'num_samples': 235951, 'num_queries': 214354, 'num_documents': 21597, 'min_document_length': 1, 'average_document_length': 11.19, 'max_document_length': 99, 'unique_documents': 21597, 'num_document_images': 0, 'min_query_length': 10, 'average_query_length': 30.83, 'max_query_length': 100, 'unique_queries': 81565, 'num_query_images': 214354, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 21597}} | -| [VideoRetrieval](https://arxiv.org/abs/2203.03367) | ['cmn'] | Retrieval | s2p | | None | None | +| [VehicleSoundClustering](https://huggingface.co/datasets/DynamicSuperb/Vehicle_sounds_classification_dataset) (Bazilinskyy et al., 2018) | ['eng'] | AudioClustering | a2a | [Scene] | None | None | +| [VideoRetrieval](https://arxiv.org/abs/2203.03367) (Dingkun Long, 2022) | ['cmn'] | Retrieval | s2p | | None | None | | [VidoreArxivQARetrieval](https://arxiv.org/pdf/2407.01449) (Faysse et al., 2024) | ['eng'] | DocumentUnderstanding | t2i | [Academic] | {'test': 1000} | {'test': {'number_of_characters': 49664, 'num_samples': 1000, 'num_queries': 500, 'num_documents': 500, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 500, 'min_query_length': 37, 'average_query_length': 99.33, 'max_query_length': 200, 'unique_queries': 500, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 500}} | | [VidoreDocVQARetrieval](https://arxiv.org/pdf/2407.01449) (Faysse et al., 2024) | ['eng'] | DocumentUnderstanding | t2i | [Academic] | {'test': 951} | {'test': {'number_of_characters': 19499, 'num_samples': 951, 'num_queries': 451, 'num_documents': 500, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 500, 
'min_query_length': 13, 'average_query_length': 43.24, 'max_query_length': 128, 'unique_queries': 451, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.11, 'max_relevant_docs_per_query': 12, 'unique_relevant_docs': 500}} | | [VidoreInfoVQARetrieval](https://arxiv.org/pdf/2407.01449) (Faysse et al., 2024) | ['eng'] | DocumentUnderstanding | t2i | [Academic] | {'test': 994} | {'test': {'number_of_characters': 32253, 'num_samples': 994, 'num_queries': 494, 'num_documents': 500, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 500, 'min_query_length': 21, 'average_query_length': 65.29, 'max_query_length': 167, 'unique_queries': 494, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.01, 'max_relevant_docs_per_query': 5, 'unique_relevant_docs': 500}} | @@ -836,7 +866,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [VidoreTabfquadRetrieval](https://arxiv.org/pdf/2407.01449) (Faysse et al., 2024) | ['eng'] | DocumentUnderstanding | t2i | [Academic] | {'test': 350} | {'test': {'number_of_characters': 28177, 'num_samples': 350, 'num_queries': 280, 'num_documents': 70, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 70, 'min_query_length': 33, 'average_query_length': 100.63, 'max_query_length': 184, 'unique_queries': 280, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 70}} | | [VidoreTatdqaRetrieval](https://arxiv.org/pdf/2407.01449) (Faysse et al., 2024) | ['eng'] | DocumentUnderstanding | t2i | [Academic] | {'test': 1923} | {'test': {'number_of_characters': 120235, 'num_samples': 1923, 'num_queries': 1646, 'num_documents': 277, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 277, 'min_query_length': 15, 'average_query_length': 73.05, 'max_query_length': 194, 'unique_queries': 1646, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.01, 'max_relevant_docs_per_query': 10, 'unique_relevant_docs': 277}} | | [VieMedEVBitextMining](https://aclanthology.org/2015.iwslt-evaluation.11/) (Nhu Vo, 2024) | ['eng', 'vie'] | BitextMining | s2s | [Medical, Written] | {'test': 2048} | {'test': {'num_samples': 2048, 'number_of_characters': 575910, 'unique_pairs': 2048, 'min_sentence1_length': 11, 'average_sentence1_length': 139.23, 'max_sentence1_length': 1291, 'unique_sentence1': 2048, 'min_sentence2_length': 11, 'average_sentence2_length': 141.98, 'max_sentence2_length': 1217, 'unique_sentence2': 2047}} | -| [VieQuADRetrieval](https://aclanthology.org/2020.coling-main.233.pdf) | ['vie'] | Retrieval | s2p | [Encyclopaedic, Non-fiction, Written] | None | None | +| 
[VieQuADRetrieval](https://aclanthology.org/2020.coling-main.233.pdf) (Nguyen et al., 2020) | ['vie'] | Retrieval | s2p | [Encyclopaedic, Non-fiction, Written] | None | None | | [VieStudentFeedbackClassification](https://ieeexplore.ieee.org/document/8573337) (Nguyen et al., 2018) | ['vie'] | Classification | s2s | [Reviews, Written] | None | None | | [VisualNewsI2TRetrieval](https://aclanthology.org/2021.emnlp-main.542/) (Liu et al., 2021) | ['eng'] | Any2AnyRetrieval | i2t | [Encyclopaedic] | {'test': 557568} | {'test': {'number_of_characters': 58422402, 'num_samples': 557568, 'num_queries': 20000, 'num_documents': 537568, 'min_document_length': 2, 'average_document_length': 108.68, 'max_document_length': 2751, 'unique_documents': 537568, 'num_document_images': 0, 'min_query_length': 0, 'average_query_length': 0, 'max_query_length': 0, 'unique_queries': 0, 'num_query_images': 20000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 19995}} | | [VisualNewsT2IRetrieval](https://aclanthology.org/2021.emnlp-main.542/) (Liu et al., 2021) | ['eng'] | Any2AnyRetrieval | t2i | [Encyclopaedic] | {'test': 562241} | {'test': {'number_of_characters': 2204490, 'num_samples': 562241, 'num_queries': 19995, 'num_documents': 542246, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 542246, 'min_query_length': 2, 'average_query_length': 110.25, 'max_query_length': 1356, 'unique_queries': 19995, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 20000}} | @@ -845,9 +875,12 @@ The following tables give you an overview of the tasks in MTEB. 
| [VisualSTS17Eng](https://arxiv.org/abs/2402.08183/) (Xiao et al., 2024) | ['eng'] | VisualSTS(eng) | i2i | [News, Social, Spoken, Web, Written] | None | None | | [VisualSTS17Multilingual](https://arxiv.org/abs/2402.08183/) (Xiao et al., 2024) | ['ara', 'deu', 'eng', 'fra', 'ita', 'kor', 'nld', 'spa', 'tur'] | VisualSTS(multi) | i2i | [News, Social, Spoken, Web, Written] | None | None | | [VizWizIT2TRetrieval](https://openaccess.thecvf.com/content_cvpr_2018/papers/Gurari_VizWiz_Grand_Challenge_CVPR_2018_paper.pdf) (Gurari et al., 2018) | ['eng'] | Any2AnyRetrieval | it2t | [Web] | {'test': 6410} | {'test': {'number_of_characters': 181824, 'num_samples': 6410, 'num_queries': 4319, 'num_documents': 2091, 'min_document_length': 1, 'average_document_length': 14.45, 'max_document_length': 94, 'unique_documents': 2091, 'num_document_images': 0, 'min_query_length': 7, 'average_query_length': 35.1, 'max_query_length': 264, 'unique_queries': 2798, 'num_query_images': 4319, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 2091}} | +| [VoiceGenderClustering](https://huggingface.co/datasets/mmn3690/voice-gender-clustering) | ['eng'] | AudioClustering | a2a | [Spoken] | None | None | +| [VoxCelebSA](https://huggingface.co/datasets/DynamicSuperb/Sentiment_Analysis_SLUE-VoxCeleb) (Suwon Shon, 2022) | ['eng'] | AudioClassification | a2t | [Spoken] | None | None | +| [VoxLingua107_Top10](https://huggingface.co/datasets/silky1708/VoxLingua107-Top-10) (Jörgen Valk, 2020) | ['eng'] | AudioClassification | a2t | [Speech] | None | None | | [VoyageMMarcoReranking](https://arxiv.org/abs/2312.16144) (Benjamin Clavié, 2023) | ['jpn'] | Reranking | s2s | [Academic, Non-fiction, Written] | None | None | | [WITT2IRetrieval](https://proceedings.mlr.press/v162/bugliarello22a/bugliarello22a.pdf) (Bugliarello et al., 2022) | ['ara', 'bul', 'dan', 'ell', 'eng', 'est', 'ind', 'jpn', 'kor', 'tur', 'vie'] | 
Any2AnyMultilingualRetrieval | t2i | [Encyclopaedic, Written] | {'test': 18137} | {'test': {'number_of_characters': 506601, 'num_samples': 18137, 'num_queries': 9584, 'num_documents': 8553, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 8553, 'min_query_length': 9, 'average_query_length': 52.86, 'max_query_length': 779, 'unique_queries': 9076, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 8553, 'hf_subset_descriptive_stats': {'ar': {'number_of_characters': 46144, 'num_samples': 1682, 'num_queries': 890, 'num_documents': 792, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 792, 'min_query_length': 4, 'average_query_length': 51.85, 'max_query_length': 533, 'unique_queries': 871, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 792}, 'bg': {'number_of_characters': 40682, 'num_samples': 1666, 'num_queries': 860, 'num_documents': 806, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 806, 'min_query_length': 4, 'average_query_length': 47.3, 'max_query_length': 771, 'unique_queries': 830, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 806}, 'da': {'number_of_characters': 48235, 'num_samples': 1705, 'num_queries': 891, 'num_documents': 814, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 814, 'min_query_length': 4, 'average_query_length': 54.14, 'max_query_length': 537, 'unique_queries': 889, 'num_query_images': 0, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 814}, 'el': {'number_of_characters': 30842, 'num_samples': 1111, 'num_queries': 570, 'num_documents': 541, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 541, 'min_query_length': 1, 'average_query_length': 54.11, 'max_query_length': 404, 'unique_queries': 565, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 541}, 'et': {'number_of_characters': 33995, 'num_samples': 1654, 'num_queries': 874, 'num_documents': 780, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 780, 'min_query_length': 3, 'average_query_length': 38.9, 'max_query_length': 588, 'unique_queries': 750, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 780}, 'id': {'number_of_characters': 45428, 'num_samples': 1755, 'num_queries': 901, 'num_documents': 854, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 854, 'min_query_length': 1, 'average_query_length': 50.42, 'max_query_length': 628, 'unique_queries': 863, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 854}, 'ko': {'number_of_characters': 18304, 'num_samples': 1820, 'num_queries': 931, 'num_documents': 889, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 889, 'min_query_length': 2, 'average_query_length': 19.66, 'max_query_length': 168, 'unique_queries': 905, 'num_query_images': 0, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 889}, 'ja': {'number_of_characters': 21706, 'num_samples': 1842, 'num_queries': 1000, 'num_documents': 842, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 842, 'min_query_length': 2, 'average_query_length': 21.71, 'max_query_length': 368, 'unique_queries': 875, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 842}, 'tr': {'number_of_characters': 33434, 'num_samples': 1402, 'num_queries': 721, 'num_documents': 681, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 681, 'min_query_length': 4, 'average_query_length': 46.37, 'max_query_length': 408, 'unique_queries': 712, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 681}, 'vi': {'number_of_characters': 53181, 'num_samples': 1815, 'num_queries': 946, 'num_documents': 869, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 869, 'min_query_length': 3, 'average_query_length': 56.22, 'max_query_length': 476, 'unique_queries': 921, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 869}, 'en': {'number_of_characters': 57978, 'num_samples': 1685, 'num_queries': 1000, 'num_documents': 685, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 685, 'min_query_length': 4, 'average_query_length': 57.98, 'max_query_length': 690, 'unique_queries': 895, 'num_query_images': 0, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 685}}}} | -| [WRIMEClassification](https://aclanthology.org/2021.naacl-main.169/) | ['jpn'] | Classification | s2s | [Social, Written] | None | None | +| [WRIMEClassification](https://aclanthology.org/2021.naacl-main.169/) (Kajiwara et al., 2021) | ['jpn'] | Classification | s2s | [Social, Written] | None | None | | [Waimai](https://aclanthology.org/2023.nodalida-1.20/) (Xiao et al., 2023) | ['cmn'] | Classification | s2s | | None | None | | [WebFAQBitextMiningQAs](https://huggingface.co/PaDaS-Lab) (Michael Dinzinger, 2025) | ['ara', 'aze', 'ben', 'bul', 'cat', 'ces', 'dan', 'deu', 'ell', 'eng', 'est', 'fas', 'fin', 'fra', 'heb', 'hin', 'hrv', 'hun', 'ind', 'isl', 'ita', 'jpn', 'kat', 'kaz', 'kor', 'lav', 'lit', 'mar', 'msa', 'nld', 'nor', 'pol', 'por', 'ron', 'rus', 'slk', 'slv', 'spa', 'srp', 'swe', 'tgl', 'tha', 'tur', 'ukr', 'urd', 'vie', 'zho'] | BitextMining | p2p | [Web, Written] | {'default': 682057} | {'default': {'num_samples': 682057, 'number_of_characters': 526563222, 'unique_pairs': 682057, 'min_sentence1_length': 43, 'average_sentence1_length': 386.04, 'max_sentence1_length': 18410, 'unique_sentence1': 379274, 'min_sentence2_length': 41, 'average_sentence2_length': 385.98, 'max_sentence2_length': 21081, 'unique_sentence2': 398878, 'hf_subset_descriptive_stats': {'ara-fas': {'num_samples': 609, 'number_of_characters': 411293, 'unique_pairs': 609, 'min_sentence1_length': 54, 'average_sentence1_length': 321.44, 'max_sentence1_length': 2446, 'unique_sentence1': 609, 'min_sentence2_length': 63, 'average_sentence2_length': 353.91, 'max_sentence2_length': 2754, 'unique_sentence2': 609}, 'ara-heb': {'num_samples': 978, 'number_of_characters': 628664, 'unique_pairs': 978, 'min_sentence1_length': 68, 'average_sentence1_length': 336.2, 'max_sentence1_length': 4204, 'unique_sentence1': 978, 'min_sentence2_length': 62, 
'average_sentence2_length': 306.61, 'max_sentence2_length': 3077, 'unique_sentence2': 978}, 'jpn-kor': {'num_samples': 4820, 'number_of_characters': 1914641, 'unique_pairs': 4820, 'min_sentence1_length': 43, 'average_sentence1_length': 194.26, 'max_sentence1_length': 1741, 'unique_sentence1': 4820, 'min_sentence2_length': 43, 'average_sentence2_length': 202.97, 'max_sentence2_length': 1830, 'unique_sentence2': 4820}, 'jpn-vie': {'num_samples': 1356, 'number_of_characters': 823292, 'unique_pairs': 1356, 'min_sentence1_length': 50, 'average_sentence1_length': 202.49, 'max_sentence1_length': 1660, 'unique_sentence1': 1356, 'min_sentence2_length': 71, 'average_sentence2_length': 404.66, 'max_sentence2_length': 3938, 'unique_sentence2': 1356}, 'jpn-zho': {'num_samples': 1728, 'number_of_characters': 587091, 'unique_pairs': 1728, 'min_sentence1_length': 49, 'average_sentence1_length': 198.45, 'max_sentence1_length': 6803, 'unique_sentence1': 1728, 'min_sentence2_length': 43, 'average_sentence2_length': 141.3, 'max_sentence2_length': 4500, 'unique_sentence2': 1728}, 'kor-vie': {'num_samples': 1386, 'number_of_characters': 906969, 'unique_pairs': 1386, 'min_sentence1_length': 50, 'average_sentence1_length': 224.99, 'max_sentence1_length': 3677, 'unique_sentence1': 1386, 'min_sentence2_length': 71, 'average_sentence2_length': 429.39, 'max_sentence2_length': 6989, 'unique_sentence2': 1386}, 'kor-zho': {'num_samples': 1087, 'number_of_characters': 354672, 'unique_pairs': 1087, 'min_sentence1_length': 64, 'average_sentence1_length': 193.25, 'max_sentence1_length': 1064, 'unique_sentence1': 1087, 'min_sentence2_length': 45, 'average_sentence2_length': 133.03, 'max_sentence2_length': 729, 'unique_sentence2': 1087}, 'vie-zho': {'num_samples': 646, 'number_of_characters': 341786, 'unique_pairs': 646, 'min_sentence1_length': 89, 'average_sentence1_length': 388.88, 'max_sentence1_length': 2114, 'unique_sentence1': 646, 'min_sentence2_length': 45, 'average_sentence2_length': 140.2, 
'max_sentence2_length': 1267, 'unique_sentence2': 646}, 'ind-msa': {'num_samples': 455, 'number_of_characters': 197047, 'unique_pairs': 455, 'min_sentence1_length': 78, 'average_sentence1_length': 217.38, 'max_sentence1_length': 870, 'unique_sentence1': 455, 'min_sentence2_length': 72, 'average_sentence2_length': 215.69, 'max_sentence2_length': 784, 'unique_sentence2': 455}, 'ind-tgl': {'num_samples': 378, 'number_of_characters': 265203, 'unique_pairs': 378, 'min_sentence1_length': 74, 'average_sentence1_length': 329.09, 'max_sentence1_length': 1418, 'unique_sentence1': 378, 'min_sentence2_length': 75, 'average_sentence2_length': 372.51, 'max_sentence2_length': 1600, 'unique_sentence2': 378}, 'ind-tha': {'num_samples': 1258, 'number_of_characters': 794128, 'unique_pairs': 1258, 'min_sentence1_length': 72, 'average_sentence1_length': 347.4, 'max_sentence1_length': 3226, 'unique_sentence1': 1258, 'min_sentence2_length': 63, 'average_sentence2_length': 283.86, 'max_sentence2_length': 2816, 'unique_sentence2': 1258}, 'bul-ces': {'num_samples': 1485, 'number_of_characters': 922134, 'unique_pairs': 1485, 'min_sentence1_length': 71, 'average_sentence1_length': 325.89, 'max_sentence1_length': 1945, 'unique_sentence1': 1485, 'min_sentence2_length': 56, 'average_sentence2_length': 295.08, 'max_sentence2_length': 1921, 'unique_sentence2': 1485}, 'bul-lav': {'num_samples': 710, 'number_of_characters': 492895, 'unique_pairs': 710, 'min_sentence1_length': 74, 'average_sentence1_length': 358.15, 'max_sentence1_length': 2765, 'unique_sentence1': 710, 'min_sentence2_length': 61, 'average_sentence2_length': 336.07, 'max_sentence2_length': 2523, 'unique_sentence2': 710}, 'bul-lit': {'num_samples': 803, 'number_of_characters': 540245, 'unique_pairs': 803, 'min_sentence1_length': 71, 'average_sentence1_length': 346.31, 'max_sentence1_length': 1945, 'unique_sentence1': 803, 'min_sentence2_length': 67, 'average_sentence2_length': 326.47, 'max_sentence2_length': 1925, 'unique_sentence2': 
803}, 'bul-pol': {'num_samples': 1635, 'number_of_characters': 1126043, 'unique_pairs': 1635, 'min_sentence1_length': 69, 'average_sentence1_length': 347.74, 'max_sentence1_length': 2431, 'unique_sentence1': 1635, 'min_sentence2_length': 68, 'average_sentence2_length': 340.97, 'max_sentence2_length': 2277, 'unique_sentence2': 1635}, 'bul-rus': {'num_samples': 1476, 'number_of_characters': 995879, 'unique_pairs': 1476, 'min_sentence1_length': 71, 'average_sentence1_length': 337.76, 'max_sentence1_length': 4620, 'unique_sentence1': 1476, 'min_sentence2_length': 63, 'average_sentence2_length': 336.95, 'max_sentence2_length': 4654, 'unique_sentence2': 1476}, 'bul-slk': {'num_samples': 1154, 'number_of_characters': 777946, 'unique_pairs': 1154, 'min_sentence1_length': 73, 'average_sentence1_length': 349.94, 'max_sentence1_length': 1945, 'unique_sentence1': 1154, 'min_sentence2_length': 68, 'average_sentence2_length': 324.19, 'max_sentence2_length': 2073, 'unique_sentence2': 1154}, 'bul-slv': {'num_samples': 1034, 'number_of_characters': 673719, 'unique_pairs': 1034, 'min_sentence1_length': 86, 'average_sentence1_length': 339.07, 'max_sentence1_length': 1945, 'unique_sentence1': 1034, 'min_sentence2_length': 79, 'average_sentence2_length': 312.5, 'max_sentence2_length': 1872, 'unique_sentence2': 1034}, 'bul-srp': {'num_samples': 296, 'number_of_characters': 222838, 'unique_pairs': 296, 'min_sentence1_length': 92, 'average_sentence1_length': 390.41, 'max_sentence1_length': 1945, 'unique_sentence1': 296, 'min_sentence2_length': 87, 'average_sentence2_length': 362.43, 'max_sentence2_length': 1845, 'unique_sentence2': 296}, 'bul-ukr': {'num_samples': 1074, 'number_of_characters': 708525, 'unique_pairs': 1074, 'min_sentence1_length': 64, 'average_sentence1_length': 335.27, 'max_sentence1_length': 2057, 'unique_sentence1': 1074, 'min_sentence2_length': 59, 'average_sentence2_length': 324.43, 'max_sentence2_length': 2042, 'unique_sentence2': 1074}, 'ces-lav': {'num_samples': 
875, 'number_of_characters': 569552, 'unique_pairs': 875, 'min_sentence1_length': 74, 'average_sentence1_length': 318.83, 'max_sentence1_length': 1921, 'unique_sentence1': 875, 'min_sentence2_length': 76, 'average_sentence2_length': 332.08, 'max_sentence2_length': 1948, 'unique_sentence2': 875}, 'ces-lit': {'num_samples': 1002, 'number_of_characters': 674361, 'unique_pairs': 1002, 'min_sentence1_length': 73, 'average_sentence1_length': 328.37, 'max_sentence1_length': 2956, 'unique_sentence1': 1002, 'min_sentence2_length': 84, 'average_sentence2_length': 344.64, 'max_sentence2_length': 2995, 'unique_sentence2': 1002}, 'ces-pol': {'num_samples': 3367, 'number_of_characters': 2230397, 'unique_pairs': 3367, 'min_sentence1_length': 56, 'average_sentence1_length': 317.49, 'max_sentence1_length': 2453, 'unique_sentence1': 3367, 'min_sentence2_length': 64, 'average_sentence2_length': 344.94, 'max_sentence2_length': 2621, 'unique_sentence2': 3367}, 'ces-rus': {'num_samples': 2144, 'number_of_characters': 1438311, 'unique_pairs': 2144, 'min_sentence1_length': 56, 'average_sentence1_length': 319.4, 'max_sentence1_length': 2349, 'unique_sentence1': 2144, 'min_sentence2_length': 71, 'average_sentence2_length': 351.45, 'max_sentence2_length': 2509, 'unique_sentence2': 2144}, 'ces-slk': {'num_samples': 2551, 'number_of_characters': 1733126, 'unique_pairs': 2551, 'min_sentence1_length': 65, 'average_sentence1_length': 334.9, 'max_sentence1_length': 7967, 'unique_sentence1': 2551, 'min_sentence2_length': 67, 'average_sentence2_length': 344.49, 'max_sentence2_length': 10365, 'unique_sentence2': 2551}, 'ces-slv': {'num_samples': 1370, 'number_of_characters': 848116, 'unique_pairs': 1370, 'min_sentence1_length': 54, 'average_sentence1_length': 304.5, 'max_sentence1_length': 2956, 'unique_sentence1': 1370, 'min_sentence2_length': 59, 'average_sentence2_length': 314.56, 'max_sentence2_length': 3006, 'unique_sentence2': 1370}, 'ces-srp': {'num_samples': 362, 'number_of_characters': 
238713, 'unique_pairs': 362, 'min_sentence1_length': 77, 'average_sentence1_length': 322.69, 'max_sentence1_length': 1921, 'unique_sentence1': 362, 'min_sentence2_length': 80, 'average_sentence2_length': 336.74, 'max_sentence2_length': 1861, 'unique_sentence2': 362}, 'ces-ukr': {'num_samples': 1285, 'number_of_characters': 789567, 'unique_pairs': 1285, 'min_sentence1_length': 56, 'average_sentence1_length': 295.05, 'max_sentence1_length': 1921, 'unique_sentence1': 1285, 'min_sentence2_length': 69, 'average_sentence2_length': 319.4, 'max_sentence2_length': 2042, 'unique_sentence2': 1285}, 'hrv-slk': {'num_samples': 313, 'number_of_characters': 184033, 'unique_pairs': 313, 'min_sentence1_length': 112, 'average_sentence1_length': 295.81, 'max_sentence1_length': 1393, 'unique_sentence1': 313, 'min_sentence2_length': 104, 'average_sentence2_length': 292.16, 'max_sentence2_length': 1411, 'unique_sentence2': 313}, 'kat-rus': {'num_samples': 262, 'number_of_characters': 190050, 'unique_pairs': 262, 'min_sentence1_length': 68, 'average_sentence1_length': 362.44, 'max_sentence1_length': 2879, 'unique_sentence1': 262, 'min_sentence2_length': 74, 'average_sentence2_length': 362.95, 'max_sentence2_length': 3069, 'unique_sentence2': 262}, 'lav-lit': {'num_samples': 1061, 'number_of_characters': 794243, 'unique_pairs': 1061, 'min_sentence1_length': 61, 'average_sentence1_length': 372.32, 'max_sentence1_length': 2410, 'unique_sentence1': 1061, 'min_sentence2_length': 67, 'average_sentence2_length': 376.26, 'max_sentence2_length': 2463, 'unique_sentence2': 1061}, 'lav-pol': {'num_samples': 951, 'number_of_characters': 701354, 'unique_pairs': 951, 'min_sentence1_length': 63, 'average_sentence1_length': 359.69, 'max_sentence1_length': 2044, 'unique_sentence1': 951, 'min_sentence2_length': 72, 'average_sentence2_length': 377.8, 'max_sentence2_length': 2234, 'unique_sentence2': 951}, 'lav-rus': {'num_samples': 1412, 'number_of_characters': 1039535, 'unique_pairs': 1412, 
'min_sentence1_length': 61, 'average_sentence1_length': 358.04, 'max_sentence1_length': 2206, 'unique_sentence1': 1412, 'min_sentence2_length': 63, 'average_sentence2_length': 378.18, 'max_sentence2_length': 2383, 'unique_sentence2': 1412}, 'lav-slk': {'num_samples': 789, 'number_of_characters': 535091, 'unique_pairs': 789, 'min_sentence1_length': 75, 'average_sentence1_length': 342.33, 'max_sentence1_length': 1948, 'unique_sentence1': 789, 'min_sentence2_length': 68, 'average_sentence2_length': 335.86, 'max_sentence2_length': 1910, 'unique_sentence2': 789}, 'lav-slv': {'num_samples': 518, 'number_of_characters': 340127, 'unique_pairs': 518, 'min_sentence1_length': 76, 'average_sentence1_length': 329.34, 'max_sentence1_length': 1948, 'unique_sentence1': 518, 'min_sentence2_length': 71, 'average_sentence2_length': 327.28, 'max_sentence2_length': 1872, 'unique_sentence2': 518}, 'lav-ukr': {'num_samples': 579, 'number_of_characters': 428022, 'unique_pairs': 579, 'min_sentence1_length': 61, 'average_sentence1_length': 365.16, 'max_sentence1_length': 2410, 'unique_sentence1': 579, 'min_sentence2_length': 59, 'average_sentence2_length': 374.08, 'max_sentence2_length': 2412, 'unique_sentence2': 579}, 'lit-pol': {'num_samples': 1026, 'number_of_characters': 767128, 'unique_pairs': 1026, 'min_sentence1_length': 64, 'average_sentence1_length': 366.06, 'max_sentence1_length': 1990, 'unique_sentence1': 1026, 'min_sentence2_length': 79, 'average_sentence2_length': 381.63, 'max_sentence2_length': 2234, 'unique_sentence2': 1026}, 'lit-rus': {'num_samples': 961, 'number_of_characters': 744509, 'unique_pairs': 961, 'min_sentence1_length': 67, 'average_sentence1_length': 379.47, 'max_sentence1_length': 3141, 'unique_sentence1': 961, 'min_sentence2_length': 63, 'average_sentence2_length': 395.26, 'max_sentence2_length': 2201, 'unique_sentence2': 961}, 'lit-slk': {'num_samples': 859, 'number_of_characters': 583451, 'unique_pairs': 859, 'min_sentence1_length': 74, 
'average_sentence1_length': 344.75, 'max_sentence1_length': 1925, 'unique_sentence1': 859, 'min_sentence2_length': 68, 'average_sentence2_length': 334.47, 'max_sentence2_length': 1961, 'unique_sentence2': 859}, 'lit-slv': {'num_samples': 607, 'number_of_characters': 438866, 'unique_pairs': 607, 'min_sentence1_length': 70, 'average_sentence1_length': 366.02, 'max_sentence1_length': 2995, 'unique_sentence1': 607, 'min_sentence2_length': 75, 'average_sentence2_length': 356.99, 'max_sentence2_length': 3006, 'unique_sentence2': 607}, 'lit-ukr': {'num_samples': 639, 'number_of_characters': 463616, 'unique_pairs': 639, 'min_sentence1_length': 67, 'average_sentence1_length': 361.16, 'max_sentence1_length': 2463, 'unique_sentence1': 639, 'min_sentence2_length': 59, 'average_sentence2_length': 364.37, 'max_sentence2_length': 2412, 'unique_sentence2': 639}, 'pol-rus': {'num_samples': 5014, 'number_of_characters': 3850186, 'unique_pairs': 5014, 'min_sentence1_length': 60, 'average_sentence1_length': 380.93, 'max_sentence1_length': 5103, 'unique_sentence1': 5014, 'min_sentence2_length': 59, 'average_sentence2_length': 386.95, 'max_sentence2_length': 4888, 'unique_sentence2': 5014}, 'pol-slk': {'num_samples': 1918, 'number_of_characters': 1321855, 'unique_pairs': 1918, 'min_sentence1_length': 71, 'average_sentence1_length': 354.28, 'max_sentence1_length': 5103, 'unique_sentence1': 1918, 'min_sentence2_length': 67, 'average_sentence2_length': 334.9, 'max_sentence2_length': 4641, 'unique_sentence2': 1918}, 'pol-slv': {'num_samples': 1382, 'number_of_characters': 859222, 'unique_pairs': 1382, 'min_sentence1_length': 76, 'average_sentence1_length': 317.49, 'max_sentence1_length': 2101, 'unique_sentence1': 1382, 'min_sentence2_length': 75, 'average_sentence2_length': 304.23, 'max_sentence2_length': 2015, 'unique_sentence2': 1382}, 'pol-srp': {'num_samples': 492, 'number_of_characters': 350413, 'unique_pairs': 492, 'min_sentence1_length': 82, 'average_sentence1_length': 357.26, 
'max_sentence1_length': 1902, 'unique_sentence1': 492, 'min_sentence2_length': 78, 'average_sentence2_length': 354.96, 'max_sentence2_length': 1845, 'unique_sentence2': 492}, 'pol-ukr': {'num_samples': 2370, 'number_of_characters': 1753652, 'unique_pairs': 2370, 'min_sentence1_length': 59, 'average_sentence1_length': 373.9, 'max_sentence1_length': 3106, 'unique_sentence1': 2370, 'min_sentence2_length': 61, 'average_sentence2_length': 366.04, 'max_sentence2_length': 2827, 'unique_sentence2': 2370}, 'rus-slk': {'num_samples': 1263, 'number_of_characters': 905526, 'unique_pairs': 1263, 'min_sentence1_length': 69, 'average_sentence1_length': 371.39, 'max_sentence1_length': 4888, 'unique_sentence1': 1263, 'min_sentence2_length': 67, 'average_sentence2_length': 345.58, 'max_sentence2_length': 4641, 'unique_sentence2': 1263}, 'rus-slv': {'num_samples': 1096, 'number_of_characters': 719013, 'unique_pairs': 1096, 'min_sentence1_length': 84, 'average_sentence1_length': 341.51, 'max_sentence1_length': 2164, 'unique_sentence1': 1096, 'min_sentence2_length': 71, 'average_sentence2_length': 314.52, 'max_sentence2_length': 2015, 'unique_sentence2': 1096}, 'rus-srp': {'num_samples': 455, 'number_of_characters': 341619, 'unique_pairs': 455, 'min_sentence1_length': 92, 'average_sentence1_length': 386.63, 'max_sentence1_length': 1921, 'unique_sentence1': 455, 'min_sentence2_length': 90, 'average_sentence2_length': 364.18, 'max_sentence2_length': 1845, 'unique_sentence2': 455}, 'rus-ukr': {'num_samples': 15251, 'number_of_characters': 10782282, 'unique_pairs': 15251, 'min_sentence1_length': 55, 'average_sentence1_length': 358.27, 'max_sentence1_length': 3905, 'unique_sentence1': 15251, 'min_sentence2_length': 49, 'average_sentence2_length': 348.72, 'max_sentence2_length': 3801, 'unique_sentence2': 15251}, 'slk-slv': {'num_samples': 1259, 'number_of_characters': 852109, 'unique_pairs': 1259, 'min_sentence1_length': 68, 'average_sentence1_length': 338.08, 'max_sentence1_length': 1961, 
'unique_sentence1': 1259, 'min_sentence2_length': 71, 'average_sentence2_length': 338.74, 'max_sentence2_length': 1872, 'unique_sentence2': 1259}, 'slk-srp': {'num_samples': 561, 'number_of_characters': 493396, 'unique_pairs': 561, 'min_sentence1_length': 82, 'average_sentence1_length': 431.13, 'max_sentence1_length': 1910, 'unique_sentence1': 561, 'min_sentence2_length': 80, 'average_sentence2_length': 448.36, 'max_sentence2_length': 1845, 'unique_sentence2': 561}, 'slk-ukr': {'num_samples': 944, 'number_of_characters': 608143, 'unique_pairs': 944, 'min_sentence1_length': 68, 'average_sentence1_length': 314.41, 'max_sentence1_length': 1910, 'unique_sentence1': 944, 'min_sentence2_length': 69, 'average_sentence2_length': 329.81, 'max_sentence2_length': 1923, 'unique_sentence2': 944}, 'slv-srp': {'num_samples': 499, 'number_of_characters': 378293, 'unique_pairs': 499, 'min_sentence1_length': 80, 'average_sentence1_length': 374.47, 'max_sentence1_length': 2476, 'unique_sentence1': 499, 'min_sentence2_length': 80, 'average_sentence2_length': 383.63, 'max_sentence2_length': 2387, 'unique_sentence2': 499}, 'slv-ukr': {'num_samples': 733, 'number_of_characters': 431361, 'unique_pairs': 733, 'min_sentence1_length': 71, 'average_sentence1_length': 286.64, 'max_sentence1_length': 1872, 'unique_sentence1': 733, 'min_sentence2_length': 69, 'average_sentence2_length': 301.85, 'max_sentence2_length': 1923, 'unique_sentence2': 733}, 'cat-deu': {'num_samples': 302, 'number_of_characters': 279459, 'unique_pairs': 302, 'min_sentence1_length': 66, 'average_sentence1_length': 451.95, 'max_sentence1_length': 2056, 'unique_sentence1': 302, 'min_sentence2_length': 77, 'average_sentence2_length': 473.41, 'max_sentence2_length': 2082, 'unique_sentence2': 302}, 'cat-fra': {'num_samples': 598, 'number_of_characters': 476709, 'unique_pairs': 598, 'min_sentence1_length': 62, 'average_sentence1_length': 381.68, 'max_sentence1_length': 2056, 'unique_sentence1': 598, 'min_sentence2_length': 74, 
'average_sentence2_length': 415.5, 'max_sentence2_length': 2277, 'unique_sentence2': 598}, 'cat-ita': {'num_samples': 418, 'number_of_characters': 327132, 'unique_pairs': 418, 'min_sentence1_length': 60, 'average_sentence1_length': 388.8, 'max_sentence1_length': 2056, 'unique_sentence1': 418, 'min_sentence2_length': 55, 'average_sentence2_length': 393.81, 'max_sentence2_length': 2186, 'unique_sentence2': 418}, 'cat-por': {'num_samples': 370, 'number_of_characters': 248938, 'unique_pairs': 370, 'min_sentence1_length': 58, 'average_sentence1_length': 338.32, 'max_sentence1_length': 2056, 'unique_sentence1': 370, 'min_sentence2_length': 60, 'average_sentence2_length': 334.48, 'max_sentence2_length': 2088, 'unique_sentence2': 370}, 'cat-spa': {'num_samples': 2648, 'number_of_characters': 2308040, 'unique_pairs': 2648, 'min_sentence1_length': 62, 'average_sentence1_length': 430.96, 'max_sentence1_length': 8113, 'unique_sentence1': 2648, 'min_sentence2_length': 65, 'average_sentence2_length': 440.66, 'max_sentence2_length': 8345, 'unique_sentence2': 2648}, 'dan-deu': {'num_samples': 4337, 'number_of_characters': 3443148, 'unique_pairs': 4337, 'min_sentence1_length': 60, 'average_sentence1_length': 372.7, 'max_sentence1_length': 4236, 'unique_sentence1': 4337, 'min_sentence2_length': 69, 'average_sentence2_length': 421.21, 'max_sentence2_length': 4093, 'unique_sentence2': 4337}, 'dan-fra': {'num_samples': 3802, 'number_of_characters': 3021023, 'unique_pairs': 3802, 'min_sentence1_length': 62, 'average_sentence1_length': 365.2, 'max_sentence1_length': 4236, 'unique_sentence1': 3802, 'min_sentence2_length': 70, 'average_sentence2_length': 429.39, 'max_sentence2_length': 4717, 'unique_sentence2': 3802}, 'dan-isl': {'num_samples': 327, 'number_of_characters': 230870, 'unique_pairs': 327, 'min_sentence1_length': 81, 'average_sentence1_length': 357.51, 'max_sentence1_length': 1856, 'unique_sentence1': 327, 'min_sentence2_length': 82, 'average_sentence2_length': 348.51, 
'max_sentence2_length': 1897, 'unique_sentence2': 327}, 'dan-ita': {'num_samples': 3818, 'number_of_characters': 2976506, 'unique_pairs': 3818, 'min_sentence1_length': 62, 'average_sentence1_length': 371.54, 'max_sentence1_length': 4236, 'unique_sentence1': 3818, 'min_sentence2_length': 64, 'average_sentence2_length': 408.06, 'max_sentence2_length': 4574, 'unique_sentence2': 3818}, 'dan-nld': {'num_samples': 4099, 'number_of_characters': 3047816, 'unique_pairs': 4099, 'min_sentence1_length': 68, 'average_sentence1_length': 360.12, 'max_sentence1_length': 4236, 'unique_sentence1': 4099, 'min_sentence2_length': 63, 'average_sentence2_length': 383.43, 'max_sentence2_length': 4431, 'unique_sentence2': 4099}, 'dan-nor': {'num_samples': 2603, 'number_of_characters': 1873194, 'unique_pairs': 2603, 'min_sentence1_length': 62, 'average_sentence1_length': 365.46, 'max_sentence1_length': 3505, 'unique_sentence1': 2603, 'min_sentence2_length': 59, 'average_sentence2_length': 354.17, 'max_sentence2_length': 3400, 'unique_sentence2': 2603}, 'dan-por': {'num_samples': 3206, 'number_of_characters': 2344257, 'unique_pairs': 3206, 'min_sentence1_length': 60, 'average_sentence1_length': 353.3, 'max_sentence1_length': 3843, 'unique_sentence1': 3206, 'min_sentence2_length': 65, 'average_sentence2_length': 377.91, 'max_sentence2_length': 3799, 'unique_sentence2': 3206}, 'dan-ron': {'num_samples': 2052, 'number_of_characters': 1446475, 'unique_pairs': 2052, 'min_sentence1_length': 67, 'average_sentence1_length': 336.33, 'max_sentence1_length': 4236, 'unique_sentence1': 2052, 'min_sentence2_length': 63, 'average_sentence2_length': 368.58, 'max_sentence2_length': 4285, 'unique_sentence2': 2052}, 'dan-spa': {'num_samples': 3571, 'number_of_characters': 2720999, 'unique_pairs': 3571, 'min_sentence1_length': 60, 'average_sentence1_length': 360.46, 'max_sentence1_length': 4236, 'unique_sentence1': 3571, 'min_sentence2_length': 65, 'average_sentence2_length': 401.51, 'max_sentence2_length': 
4498, 'unique_sentence2': 3571}, 'dan-swe': {'num_samples': 4268, 'number_of_characters': 3115686, 'unique_pairs': 4268, 'min_sentence1_length': 63, 'average_sentence1_length': 368.21, 'max_sentence1_length': 15317, 'unique_sentence1': 4268, 'min_sentence2_length': 64, 'average_sentence2_length': 361.8, 'max_sentence2_length': 15427, 'unique_sentence2': 4268}, 'deu-fra': {'num_samples': 27727, 'number_of_characters': 24341503, 'unique_pairs': 27727, 'min_sentence1_length': 52, 'average_sentence1_length': 429.85, 'max_sentence1_length': 10595, 'unique_sentence1': 27727, 'min_sentence2_length': 60, 'average_sentence2_length': 448.05, 'max_sentence2_length': 11165, 'unique_sentence2': 27727}, 'deu-isl': {'num_samples': 294, 'number_of_characters': 221411, 'unique_pairs': 294, 'min_sentence1_length': 84, 'average_sentence1_length': 403.22, 'max_sentence1_length': 2082, 'unique_sentence1': 294, 'min_sentence2_length': 79, 'average_sentence2_length': 349.88, 'max_sentence2_length': 1897, 'unique_sentence2': 294}, 'deu-ita': {'num_samples': 18787, 'number_of_characters': 15815694, 'unique_pairs': 18787, 'min_sentence1_length': 52, 'average_sentence1_length': 424.82, 'max_sentence1_length': 7136, 'unique_sentence1': 18787, 'min_sentence2_length': 56, 'average_sentence2_length': 417.02, 'max_sentence2_length': 6634, 'unique_sentence2': 18787}, 'deu-nld': {'num_samples': 14211, 'number_of_characters': 11610783, 'unique_pairs': 14211, 'min_sentence1_length': 52, 'average_sentence1_length': 418.9, 'max_sentence1_length': 4919, 'unique_sentence1': 14211, 'min_sentence2_length': 62, 'average_sentence2_length': 398.13, 'max_sentence2_length': 4779, 'unique_sentence2': 14211}, 'deu-nor': {'num_samples': 2783, 'number_of_characters': 2150067, 'unique_pairs': 2783, 'min_sentence1_length': 63, 'average_sentence1_length': 413.82, 'max_sentence1_length': 18410, 'unique_sentence1': 2783, 'min_sentence2_length': 63, 'average_sentence2_length': 358.75, 'max_sentence2_length': 16149, 
'unique_sentence2': 2783}, 'deu-por': {'num_samples': 11319, 'number_of_characters': 9085897, 'unique_pairs': 11319, 'min_sentence1_length': 63, 'average_sentence1_length': 412.04, 'max_sentence1_length': 7136, 'unique_sentence1': 11319, 'min_sentence2_length': 59, 'average_sentence2_length': 390.67, 'max_sentence2_length': 6536, 'unique_sentence2': 11319}, 'deu-ron': {'num_samples': 3598, 'number_of_characters': 2755112, 'unique_pairs': 3598, 'min_sentence1_length': 61, 'average_sentence1_length': 387.56, 'max_sentence1_length': 4093, 'unique_sentence1': 3598, 'min_sentence2_length': 55, 'average_sentence2_length': 378.17, 'max_sentence2_length': 4285, 'unique_sentence2': 3598}, 'deu-spa': {'num_samples': 19739, 'number_of_characters': 16855942, 'unique_pairs': 19739, 'min_sentence1_length': 60, 'average_sentence1_length': 430.84, 'max_sentence1_length': 7136, 'unique_sentence1': 19739, 'min_sentence2_length': 55, 'average_sentence2_length': 423.1, 'max_sentence2_length': 6963, 'unique_sentence2': 19739}, 'deu-swe': {'num_samples': 5772, 'number_of_characters': 4469906, 'unique_pairs': 5772, 'min_sentence1_length': 59, 'average_sentence1_length': 412.41, 'max_sentence1_length': 4093, 'unique_sentence1': 5772, 'min_sentence2_length': 57, 'average_sentence2_length': 362.0, 'max_sentence2_length': 4038, 'unique_sentence2': 5772}, 'fra-isl': {'num_samples': 347, 'number_of_characters': 256923, 'unique_pairs': 347, 'min_sentence1_length': 76, 'average_sentence1_length': 400.64, 'max_sentence1_length': 2277, 'unique_sentence1': 347, 'min_sentence2_length': 75, 'average_sentence2_length': 339.77, 'max_sentence2_length': 1897, 'unique_sentence2': 347}, 'fra-ita': {'num_samples': 20002, 'number_of_characters': 17269559, 'unique_pairs': 20002, 'min_sentence1_length': 62, 'average_sentence1_length': 444.94, 'max_sentence1_length': 7253, 'unique_sentence1': 20002, 'min_sentence2_length': 49, 'average_sentence2_length': 418.46, 'max_sentence2_length': 6634, 'unique_sentence2': 
20002}, 'fra-nld': {'num_samples': 14684, 'number_of_characters': 12784405, 'unique_pairs': 14684, 'min_sentence1_length': 60, 'average_sentence1_length': 455.31, 'max_sentence1_length': 9107, 'unique_sentence1': 14684, 'min_sentence2_length': 51, 'average_sentence2_length': 415.33, 'max_sentence2_length': 8534, 'unique_sentence2': 14684}, 'fra-nor': {'num_samples': 2558, 'number_of_characters': 1963253, 'unique_pairs': 2558, 'min_sentence1_length': 86, 'average_sentence1_length': 421.16, 'max_sentence1_length': 4086, 'unique_sentence1': 2558, 'min_sentence2_length': 64, 'average_sentence2_length': 346.33, 'max_sentence2_length': 3400, 'unique_sentence2': 2558}, 'fra-por': {'num_samples': 13265, 'number_of_characters': 10942625, 'unique_pairs': 13265, 'min_sentence1_length': 61, 'average_sentence1_length': 431.68, 'max_sentence1_length': 7253, 'unique_sentence1': 13265, 'min_sentence2_length': 55, 'average_sentence2_length': 393.24, 'max_sentence2_length': 6536, 'unique_sentence2': 13265}, 'fra-ron': {'num_samples': 3295, 'number_of_characters': 2448321, 'unique_pairs': 3295, 'min_sentence1_length': 70, 'average_sentence1_length': 385.29, 'max_sentence1_length': 4717, 'unique_sentence1': 3295, 'min_sentence2_length': 63, 'average_sentence2_length': 357.75, 'max_sentence2_length': 4285, 'unique_sentence2': 3295}, 'fra-spa': {'num_samples': 23311, 'number_of_characters': 20477763, 'unique_pairs': 23311, 'min_sentence1_length': 54, 'average_sentence1_length': 451.84, 'max_sentence1_length': 14943, 'unique_sentence1': 23311, 'min_sentence2_length': 50, 'average_sentence2_length': 426.62, 'max_sentence2_length': 13323, 'unique_sentence2': 23311}, 'fra-swe': {'num_samples': 5006, 'number_of_characters': 3880565, 'unique_pairs': 5006, 'min_sentence1_length': 70, 'average_sentence1_length': 421.95, 'max_sentence1_length': 4717, 'unique_sentence1': 5006, 'min_sentence2_length': 59, 'average_sentence2_length': 353.23, 'max_sentence2_length': 4038, 'unique_sentence2': 5006}, 
'isl-ita': {'num_samples': 421, 'number_of_characters': 310158, 'unique_pairs': 421, 'min_sentence1_length': 75, 'average_sentence1_length': 350.33, 'max_sentence1_length': 1897, 'unique_sentence1': 421, 'min_sentence2_length': 74, 'average_sentence2_length': 386.39, 'max_sentence2_length': 2186, 'unique_sentence2': 421}, 'isl-nld': {'num_samples': 311, 'number_of_characters': 228026, 'unique_pairs': 311, 'min_sentence1_length': 123, 'average_sentence1_length': 350.56, 'max_sentence1_length': 1705, 'unique_sentence1': 311, 'min_sentence2_length': 125, 'average_sentence2_length': 382.64, 'max_sentence2_length': 2142, 'unique_sentence2': 311}, 'isl-por': {'num_samples': 341, 'number_of_characters': 260057, 'unique_pairs': 341, 'min_sentence1_length': 110, 'average_sentence1_length': 367.62, 'max_sentence1_length': 1897, 'unique_sentence1': 341, 'min_sentence2_length': 128, 'average_sentence2_length': 395.01, 'max_sentence2_length': 2088, 'unique_sentence2': 341}, 'isl-spa': {'num_samples': 366, 'number_of_characters': 262085, 'unique_pairs': 366, 'min_sentence1_length': 75, 'average_sentence1_length': 336.95, 'max_sentence1_length': 1517, 'unique_sentence1': 366, 'min_sentence2_length': 73, 'average_sentence2_length': 379.13, 'max_sentence2_length': 1601, 'unique_sentence2': 366}, 'isl-swe': {'num_samples': 312, 'number_of_characters': 214179, 'unique_pairs': 312, 'min_sentence1_length': 75, 'average_sentence1_length': 342.19, 'max_sentence1_length': 1897, 'unique_sentence1': 312, 'min_sentence2_length': 78, 'average_sentence2_length': 344.28, 'max_sentence2_length': 1841, 'unique_sentence2': 312}, 'ita-nld': {'num_samples': 9160, 'number_of_characters': 7462623, 'unique_pairs': 9160, 'min_sentence1_length': 64, 'average_sentence1_length': 413.37, 'max_sentence1_length': 16311, 'unique_sentence1': 9160, 'min_sentence2_length': 62, 'average_sentence2_length': 401.33, 'max_sentence2_length': 15855, 'unique_sentence2': 9160}, 'ita-nor': {'num_samples': 2516, 
'number_of_characters': 1877016, 'unique_pairs': 2516, 'min_sentence1_length': 88, 'average_sentence1_length': 395.47, 'max_sentence1_length': 3906, 'unique_sentence1': 2516, 'min_sentence2_length': 80, 'average_sentence2_length': 350.57, 'max_sentence2_length': 3030, 'unique_sentence2': 2516}, 'ita-por': {'num_samples': 10924, 'number_of_characters': 8751330, 'unique_pairs': 10924, 'min_sentence1_length': 55, 'average_sentence1_length': 406.05, 'max_sentence1_length': 16311, 'unique_sentence1': 10924, 'min_sentence2_length': 55, 'average_sentence2_length': 395.06, 'max_sentence2_length': 16230, 'unique_sentence2': 10924}, 'ita-ron': {'num_samples': 3360, 'number_of_characters': 2417807, 'unique_pairs': 3360, 'min_sentence1_length': 63, 'average_sentence1_length': 360.05, 'max_sentence1_length': 6226, 'unique_sentence1': 3360, 'min_sentence2_length': 65, 'average_sentence2_length': 359.53, 'max_sentence2_length': 6571, 'unique_sentence2': 3360}, 'ita-spa': {'num_samples': 16534, 'number_of_characters': 14167503, 'unique_pairs': 16534, 'min_sentence1_length': 49, 'average_sentence1_length': 426.64, 'max_sentence1_length': 16311, 'unique_sentence1': 16534, 'min_sentence2_length': 50, 'average_sentence2_length': 430.23, 'max_sentence2_length': 16655, 'unique_sentence2': 16534}, 'ita-swe': {'num_samples': 4741, 'number_of_characters': 3557838, 'unique_pairs': 4741, 'min_sentence1_length': 64, 'average_sentence1_length': 394.97, 'max_sentence1_length': 16311, 'unique_sentence1': 4741, 'min_sentence2_length': 57, 'average_sentence2_length': 355.47, 'max_sentence2_length': 15020, 'unique_sentence2': 4741}, 'nld-nor': {'num_samples': 2664, 'number_of_characters': 1941142, 'unique_pairs': 2664, 'min_sentence1_length': 72, 'average_sentence1_length': 380.63, 'max_sentence1_length': 3967, 'unique_sentence1': 2664, 'min_sentence2_length': 68, 'average_sentence2_length': 348.03, 'max_sentence2_length': 3400, 'unique_sentence2': 2664}, 'nld-por': {'num_samples': 7021, 
'number_of_characters': 5347190, 'unique_pairs': 7021, 'min_sentence1_length': 51, 'average_sentence1_length': 380.22, 'max_sentence1_length': 15855, 'unique_sentence1': 7021, 'min_sentence2_length': 56, 'average_sentence2_length': 381.38, 'max_sentence2_length': 16230, 'unique_sentence2': 7021}, 'nld-ron': {'num_samples': 2888, 'number_of_characters': 2001437, 'unique_pairs': 2888, 'min_sentence1_length': 60, 'average_sentence1_length': 340.69, 'max_sentence1_length': 4431, 'unique_sentence1': 2888, 'min_sentence2_length': 70, 'average_sentence2_length': 352.33, 'max_sentence2_length': 4285, 'unique_sentence2': 2888}, 'nld-spa': {'num_samples': 9555, 'number_of_characters': 7861048, 'unique_pairs': 9555, 'min_sentence1_length': 62, 'average_sentence1_length': 403.95, 'max_sentence1_length': 15855, 'unique_sentence1': 9555, 'min_sentence2_length': 55, 'average_sentence2_length': 418.76, 'max_sentence2_length': 16655, 'unique_sentence2': 9555}, 'nld-swe': {'num_samples': 5072, 'number_of_characters': 3727392, 'unique_pairs': 5072, 'min_sentence1_length': 65, 'average_sentence1_length': 381.69, 'max_sentence1_length': 15855, 'unique_sentence1': 5072, 'min_sentence2_length': 56, 'average_sentence2_length': 353.2, 'max_sentence2_length': 15020, 'unique_sentence2': 5072}, 'nor-por': {'num_samples': 2096, 'number_of_characters': 1461412, 'unique_pairs': 2096, 'min_sentence1_length': 68, 'average_sentence1_length': 331.47, 'max_sentence1_length': 3400, 'unique_sentence1': 2096, 'min_sentence2_length': 61, 'average_sentence2_length': 365.77, 'max_sentence2_length': 3784, 'unique_sentence2': 2096}, 'nor-ron': {'num_samples': 1412, 'number_of_characters': 972154, 'unique_pairs': 1412, 'min_sentence1_length': 78, 'average_sentence1_length': 324.01, 'max_sentence1_length': 1884, 'unique_sentence1': 1412, 'min_sentence2_length': 75, 'average_sentence2_length': 364.48, 'max_sentence2_length': 2196, 'unique_sentence2': 1412}, 'nor-spa': {'num_samples': 2603, 
'number_of_characters': 1933198, 'unique_pairs': 2603, 'min_sentence1_length': 63, 'average_sentence1_length': 347.56, 'max_sentence1_length': 3030, 'unique_sentence1': 2603, 'min_sentence2_length': 74, 'average_sentence2_length': 395.12, 'max_sentence2_length': 3847, 'unique_sentence2': 2603}, 'nor-swe': {'num_samples': 3165, 'number_of_characters': 2305135, 'unique_pairs': 3165, 'min_sentence1_length': 66, 'average_sentence1_length': 360.99, 'max_sentence1_length': 2366, 'unique_sentence1': 3165, 'min_sentence2_length': 70, 'average_sentence2_length': 367.33, 'max_sentence2_length': 2340, 'unique_sentence2': 3165}, 'por-ron': {'num_samples': 3026, 'number_of_characters': 2086079, 'unique_pairs': 3026, 'min_sentence1_length': 71, 'average_sentence1_length': 340.88, 'max_sentence1_length': 4439, 'unique_sentence1': 3026, 'min_sentence2_length': 63, 'average_sentence2_length': 348.51, 'max_sentence2_length': 3274, 'unique_sentence2': 3026}, 'por-spa': {'num_samples': 16084, 'number_of_characters': 12835938, 'unique_pairs': 16084, 'min_sentence1_length': 51, 'average_sentence1_length': 391.86, 'max_sentence1_length': 16230, 'unique_sentence1': 16084, 'min_sentence2_length': 54, 'average_sentence2_length': 406.2, 'max_sentence2_length': 16655, 'unique_sentence2': 16084}, 'por-swe': {'num_samples': 4235, 'number_of_characters': 2994503, 'unique_pairs': 4235, 'min_sentence1_length': 62, 'average_sentence1_length': 367.66, 'max_sentence1_length': 16230, 'unique_sentence1': 4235, 'min_sentence2_length': 57, 'average_sentence2_length': 339.43, 'max_sentence2_length': 15020, 'unique_sentence2': 4235}, 'ron-spa': {'num_samples': 3375, 'number_of_characters': 2415347, 'unique_pairs': 3375, 'min_sentence1_length': 73, 'average_sentence1_length': 355.39, 'max_sentence1_length': 4285, 'unique_sentence1': 3375, 'min_sentence2_length': 70, 'average_sentence2_length': 360.27, 'max_sentence2_length': 4498, 'unique_sentence2': 3375}, 'ron-swe': {'num_samples': 2154, 
'number_of_characters': 1454257, 'unique_pairs': 2154, 'min_sentence1_length': 63, 'average_sentence1_length': 354.66, 'max_sentence1_length': 4285, 'unique_sentence1': 2154, 'min_sentence2_length': 63, 'average_sentence2_length': 320.48, 'max_sentence2_length': 4038, 'unique_sentence2': 2154}, 'spa-swe': {'num_samples': 4884, 'number_of_characters': 3751782, 'unique_pairs': 4884, 'min_sentence1_length': 66, 'average_sentence1_length': 406.52, 'max_sentence1_length': 16655, 'unique_sentence1': 4884, 'min_sentence2_length': 62, 'average_sentence2_length': 361.66, 'max_sentence2_length': 15020, 'unique_sentence2': 4884}, 'ben-hin': {'num_samples': 1174, 'number_of_characters': 682915, 'unique_pairs': 1174, 'min_sentence1_length': 64, 'average_sentence1_length': 287.33, 'max_sentence1_length': 1957, 'unique_sentence1': 1174, 'min_sentence2_length': 50, 'average_sentence2_length': 294.37, 'max_sentence2_length': 1980, 'unique_sentence2': 1174}, 'ben-mar': {'num_samples': 566, 'number_of_characters': 305353, 'unique_pairs': 566, 'min_sentence1_length': 50, 'average_sentence1_length': 271.83, 'max_sentence1_length': 1753, 'unique_sentence1': 566, 'min_sentence2_length': 57, 'average_sentence2_length': 267.66, 'max_sentence2_length': 1780, 'unique_sentence2': 566}, 'ben-urd': {'num_samples': 488, 'number_of_characters': 265698, 'unique_pairs': 488, 'min_sentence1_length': 61, 'average_sentence1_length': 269.23, 'max_sentence1_length': 1190, 'unique_sentence1': 488, 'min_sentence2_length': 62, 'average_sentence2_length': 275.23, 'max_sentence2_length': 1179, 'unique_sentence2': 488}, 'hin-mar': {'num_samples': 615, 'number_of_characters': 320880, 'unique_pairs': 615, 'min_sentence1_length': 58, 'average_sentence1_length': 265.89, 'max_sentence1_length': 1769, 'unique_sentence1': 615, 'min_sentence2_length': 58, 'average_sentence2_length': 255.87, 'max_sentence2_length': 1780, 'unique_sentence2': 615}, 'hin-urd': {'num_samples': 545, 'number_of_characters': 293939, 
'unique_pairs': 545, 'min_sentence1_length': 63, 'average_sentence1_length': 271.89, 'max_sentence1_length': 1206, 'unique_sentence1': 545, 'min_sentence2_length': 62, 'average_sentence2_length': 267.45, 'max_sentence2_length': 1179, 'unique_sentence2': 545}, 'mar-urd': {'num_samples': 270, 'number_of_characters': 147706, 'unique_pairs': 270, 'min_sentence1_length': 63, 'average_sentence1_length': 270.9, 'max_sentence1_length': 1169, 'unique_sentence1': 270, 'min_sentence2_length': 66, 'average_sentence2_length': 276.16, 'max_sentence2_length': 1172, 'unique_sentence2': 270}, 'aze-kaz': {'num_samples': 412, 'number_of_characters': 230950, 'unique_pairs': 412, 'min_sentence1_length': 73, 'average_sentence1_length': 280.5, 'max_sentence1_length': 1824, 'unique_sentence1': 412, 'min_sentence2_length': 68, 'average_sentence2_length': 280.06, 'max_sentence2_length': 1855, 'unique_sentence2': 412}, 'aze-tur': {'num_samples': 388, 'number_of_characters': 205998, 'unique_pairs': 388, 'min_sentence1_length': 72, 'average_sentence1_length': 266.67, 'max_sentence1_length': 1824, 'unique_sentence1': 388, 'min_sentence2_length': 64, 'average_sentence2_length': 264.26, 'max_sentence2_length': 1838, 'unique_sentence2': 388}, 'kaz-tur': {'num_samples': 340, 'number_of_characters': 181572, 'unique_pairs': 340, 'min_sentence1_length': 68, 'average_sentence1_length': 267.44, 'max_sentence1_length': 1855, 'unique_sentence1': 340, 'min_sentence2_length': 64, 'average_sentence2_length': 266.59, 'max_sentence2_length': 1838, 'unique_sentence2': 340}, 'est-fin': {'num_samples': 790, 'number_of_characters': 551725, 'unique_pairs': 790, 'min_sentence1_length': 63, 'average_sentence1_length': 341.07, 'max_sentence1_length': 1829, 'unique_sentence1': 790, 'min_sentence2_length': 63, 'average_sentence2_length': 357.32, 'max_sentence2_length': 1815, 'unique_sentence2': 790}, 'est-hun': {'num_samples': 674, 'number_of_characters': 465779, 'unique_pairs': 674, 'min_sentence1_length': 72, 
'average_sentence1_length': 329.88, 'max_sentence1_length': 2117, 'unique_sentence1': 674, 'min_sentence2_length': 73, 'average_sentence2_length': 361.18, 'max_sentence2_length': 2403, 'unique_sentence2': 674}, 'fin-hun': {'num_samples': 1542, 'number_of_characters': 1062678, 'unique_pairs': 1542, 'min_sentence1_length': 65, 'average_sentence1_length': 336.81, 'max_sentence1_length': 2500, 'unique_sentence1': 1542, 'min_sentence2_length': 66, 'average_sentence2_length': 352.35, 'max_sentence2_length': 2674, 'unique_sentence2': 1542}, 'ara-eng': {'num_samples': 5698, 'number_of_characters': 3662324, 'unique_pairs': 5698, 'min_sentence1_length': 53, 'average_sentence1_length': 302.59, 'max_sentence1_length': 2664, 'unique_sentence1': 5698, 'min_sentence2_length': 60, 'average_sentence2_length': 340.15, 'max_sentence2_length': 2811, 'unique_sentence2': 5698}, 'aze-eng': {'num_samples': 603, 'number_of_characters': 383857, 'unique_pairs': 603, 'min_sentence1_length': 58, 'average_sentence1_length': 323.52, 'max_sentence1_length': 1339, 'unique_sentence1': 603, 'min_sentence2_length': 69, 'average_sentence2_length': 313.05, 'max_sentence2_length': 1352, 'unique_sentence2': 603}, 'ben-eng': {'num_samples': 1367, 'number_of_characters': 866437, 'unique_pairs': 1367, 'min_sentence1_length': 50, 'average_sentence1_length': 318.67, 'max_sentence1_length': 1191, 'unique_sentence1': 1367, 'min_sentence2_length': 65, 'average_sentence2_length': 315.16, 'max_sentence2_length': 1094, 'unique_sentence2': 1367}, 'bul-eng': {'num_samples': 2133, 'number_of_characters': 1458078, 'unique_pairs': 2133, 'min_sentence1_length': 65, 'average_sentence1_length': 354.96, 'max_sentence1_length': 3016, 'unique_sentence1': 2133, 'min_sentence2_length': 65, 'average_sentence2_length': 328.62, 'max_sentence2_length': 2770, 'unique_sentence2': 2133}, 'cat-eng': {'num_samples': 1152, 'number_of_characters': 977495, 'unique_pairs': 1152, 'min_sentence1_length': 64, 'average_sentence1_length': 
437.25, 'max_sentence1_length': 8113, 'unique_sentence1': 1152, 'min_sentence2_length': 59, 'average_sentence2_length': 411.27, 'max_sentence2_length': 7400, 'unique_sentence2': 1152}, 'ces-eng': {'num_samples': 3775, 'number_of_characters': 2651016, 'unique_pairs': 3775, 'min_sentence1_length': 54, 'average_sentence1_length': 347.59, 'max_sentence1_length': 2349, 'unique_sentence1': 3775, 'min_sentence2_length': 60, 'average_sentence2_length': 354.66, 'max_sentence2_length': 2401, 'unique_sentence2': 3775}, 'dan-eng': {'num_samples': 4512, 'number_of_characters': 3404958, 'unique_pairs': 4512, 'min_sentence1_length': 49, 'average_sentence1_length': 381.45, 'max_sentence1_length': 15317, 'unique_sentence1': 4512, 'min_sentence2_length': 56, 'average_sentence2_length': 373.19, 'max_sentence2_length': 14749, 'unique_sentence2': 4512}, 'deu-eng': {'num_samples': 37348, 'number_of_characters': 29811894, 'unique_pairs': 37348, 'min_sentence1_length': 53, 'average_sentence1_length': 423.63, 'max_sentence1_length': 6437, 'unique_sentence1': 37348, 'min_sentence2_length': 46, 'average_sentence2_length': 374.59, 'max_sentence2_length': 5781, 'unique_sentence2': 37348}, 'ell-eng': {'num_samples': 2790, 'number_of_characters': 2021300, 'unique_pairs': 2790, 'min_sentence1_length': 66, 'average_sentence1_length': 394.65, 'max_sentence1_length': 2963, 'unique_sentence1': 2790, 'min_sentence2_length': 61, 'average_sentence2_length': 329.83, 'max_sentence2_length': 3013, 'unique_sentence2': 2790}, 'eng-est': {'num_samples': 755, 'number_of_characters': 528417, 'unique_pairs': 755, 'min_sentence1_length': 58, 'average_sentence1_length': 352.58, 'max_sentence1_length': 1567, 'unique_sentence1': 755, 'min_sentence2_length': 62, 'average_sentence2_length': 347.31, 'max_sentence2_length': 1630, 'unique_sentence2': 755}, 'eng-fas': {'num_samples': 556, 'number_of_characters': 431685, 'unique_pairs': 556, 'min_sentence1_length': 59, 'average_sentence1_length': 396.65, 
'max_sentence1_length': 5339, 'unique_sentence1': 556, 'min_sentence2_length': 68, 'average_sentence2_length': 379.76, 'max_sentence2_length': 4782, 'unique_sentence2': 556}, 'eng-fin': {'num_samples': 3443, 'number_of_characters': 2505517, 'unique_pairs': 3443, 'min_sentence1_length': 62, 'average_sentence1_length': 359.76, 'max_sentence1_length': 4412, 'unique_sentence1': 3443, 'min_sentence2_length': 57, 'average_sentence2_length': 367.96, 'max_sentence2_length': 4583, 'unique_sentence2': 3443}, 'eng-fra': {'num_samples': 37208, 'number_of_characters': 30609932, 'unique_pairs': 37208, 'min_sentence1_length': 52, 'average_sentence1_length': 375.68, 'max_sentence1_length': 14463, 'unique_sentence1': 37208, 'min_sentence2_length': 59, 'average_sentence2_length': 446.99, 'max_sentence2_length': 15312, 'unique_sentence2': 37208}, 'eng-heb': {'num_samples': 882, 'number_of_characters': 541517, 'unique_pairs': 882, 'min_sentence1_length': 80, 'average_sentence1_length': 339.58, 'max_sentence1_length': 16651, 'unique_sentence1': 882, 'min_sentence2_length': 62, 'average_sentence2_length': 274.39, 'max_sentence2_length': 14483, 'unique_sentence2': 882}, 'eng-hin': {'num_samples': 2219, 'number_of_characters': 1277126, 'unique_pairs': 2219, 'min_sentence1_length': 59, 'average_sentence1_length': 284.61, 'max_sentence1_length': 2439, 'unique_sentence1': 2219, 'min_sentence2_length': 50, 'average_sentence2_length': 290.93, 'max_sentence2_length': 2496, 'unique_sentence2': 2219}, 'eng-hrv': {'num_samples': 336, 'number_of_characters': 247780, 'unique_pairs': 336, 'min_sentence1_length': 79, 'average_sentence1_length': 370.68, 'max_sentence1_length': 1657, 'unique_sentence1': 336, 'min_sentence2_length': 59, 'average_sentence2_length': 366.76, 'max_sentence2_length': 1393, 'unique_sentence2': 336}, 'eng-hun': {'num_samples': 2185, 'number_of_characters': 1517498, 'unique_pairs': 2185, 'min_sentence1_length': 61, 'average_sentence1_length': 334.86, 'max_sentence1_length': 
1664, 'unique_sentence1': 2185, 'min_sentence2_length': 55, 'average_sentence2_length': 359.65, 'max_sentence2_length': 1814, 'unique_sentence2': 2185}, 'eng-ind': {'num_samples': 3454, 'number_of_characters': 2493124, 'unique_pairs': 3454, 'min_sentence1_length': 60, 'average_sentence1_length': 344.73, 'max_sentence1_length': 4253, 'unique_sentence1': 3454, 'min_sentence2_length': 67, 'average_sentence2_length': 377.07, 'max_sentence2_length': 4132, 'unique_sentence2': 3454}, 'eng-isl': {'num_samples': 358, 'number_of_characters': 237004, 'unique_pairs': 358, 'min_sentence1_length': 72, 'average_sentence1_length': 329.61, 'max_sentence1_length': 1112, 'unique_sentence1': 358, 'min_sentence2_length': 69, 'average_sentence2_length': 332.41, 'max_sentence2_length': 1206, 'unique_sentence2': 358}, 'eng-ita': {'num_samples': 19661, 'number_of_characters': 15893506, 'unique_pairs': 19661, 'min_sentence1_length': 54, 'average_sentence1_length': 381.62, 'max_sentence1_length': 14540, 'unique_sentence1': 19661, 'min_sentence2_length': 49, 'average_sentence2_length': 426.75, 'max_sentence2_length': 16311, 'unique_sentence2': 19661}, 'eng-jpn': {'num_samples': 3807, 'number_of_characters': 1890484, 'unique_pairs': 3807, 'min_sentence1_length': 66, 'average_sentence1_length': 323.35, 'max_sentence1_length': 1738, 'unique_sentence1': 3807, 'min_sentence2_length': 47, 'average_sentence2_length': 173.23, 'max_sentence2_length': 856, 'unique_sentence2': 3807}, 'eng-kaz': {'num_samples': 346, 'number_of_characters': 245791, 'unique_pairs': 346, 'min_sentence1_length': 72, 'average_sentence1_length': 349.87, 'max_sentence1_length': 2126, 'unique_sentence1': 346, 'min_sentence2_length': 79, 'average_sentence2_length': 360.51, 'max_sentence2_length': 2052, 'unique_sentence2': 346}, 'eng-kor': {'num_samples': 2558, 'number_of_characters': 1358932, 'unique_pairs': 2558, 'min_sentence1_length': 58, 'average_sentence1_length': 342.45, 'max_sentence1_length': 11054, 'unique_sentence1': 
2558, 'min_sentence2_length': 47, 'average_sentence2_length': 188.79, 'max_sentence2_length': 6212, 'unique_sentence2': 2558}, 'eng-lav': {'num_samples': 1079, 'number_of_characters': 765748, 'unique_pairs': 1079, 'min_sentence1_length': 63, 'average_sentence1_length': 352.18, 'max_sentence1_length': 3960, 'unique_sentence1': 1079, 'min_sentence2_length': 65, 'average_sentence2_length': 357.5, 'max_sentence2_length': 3996, 'unique_sentence2': 1079}, 'eng-lit': {'num_samples': 1185, 'number_of_characters': 884963, 'unique_pairs': 1185, 'min_sentence1_length': 70, 'average_sentence1_length': 369.4, 'max_sentence1_length': 3955, 'unique_sentence1': 1185, 'min_sentence2_length': 87, 'average_sentence2_length': 377.4, 'max_sentence2_length': 3841, 'unique_sentence2': 1185}, 'eng-mar': {'num_samples': 280, 'number_of_characters': 153207, 'unique_pairs': 280, 'min_sentence1_length': 65, 'average_sentence1_length': 274.54, 'max_sentence1_length': 883, 'unique_sentence1': 280, 'min_sentence2_length': 57, 'average_sentence2_length': 272.62, 'max_sentence2_length': 943, 'unique_sentence2': 280}, 'eng-msa': {'num_samples': 469, 'number_of_characters': 205989, 'unique_pairs': 469, 'min_sentence1_length': 58, 'average_sentence1_length': 209.64, 'max_sentence1_length': 1842, 'unique_sentence1': 469, 'min_sentence2_length': 58, 'average_sentence2_length': 229.57, 'max_sentence2_length': 2082, 'unique_sentence2': 469}, 'eng-nld': {'num_samples': 15613, 'number_of_characters': 12313146, 'unique_pairs': 15613, 'min_sentence1_length': 55, 'average_sentence1_length': 378.03, 'max_sentence1_length': 15297, 'unique_sentence1': 15613, 'min_sentence2_length': 54, 'average_sentence2_length': 410.62, 'max_sentence2_length': 16485, 'unique_sentence2': 15613}, 'eng-nor': {'num_samples': 2666, 'number_of_characters': 1883809, 'unique_pairs': 2666, 'min_sentence1_length': 55, 'average_sentence1_length': 353.08, 'max_sentence1_length': 2834, 'unique_sentence1': 2666, 'min_sentence2_length': 56, 
'average_sentence2_length': 353.53, 'max_sentence2_length': 2795, 'unique_sentence2': 2666}, 'eng-pol': {'num_samples': 6868, 'number_of_characters': 4946440, 'unique_pairs': 6868, 'min_sentence1_length': 54, 'average_sentence1_length': 349.29, 'max_sentence1_length': 4412, 'unique_sentence1': 6868, 'min_sentence2_length': 57, 'average_sentence2_length': 370.92, 'max_sentence2_length': 5103, 'unique_sentence2': 6868}, 'eng-por': {'num_samples': 12406, 'number_of_characters': 9040635, 'unique_pairs': 12406, 'min_sentence1_length': 58, 'average_sentence1_length': 347.94, 'max_sentence1_length': 6643, 'unique_sentence1': 12406, 'min_sentence2_length': 56, 'average_sentence2_length': 380.79, 'max_sentence2_length': 7445, 'unique_sentence2': 12406}, 'eng-ron': {'num_samples': 3039, 'number_of_characters': 2119434, 'unique_pairs': 3039, 'min_sentence1_length': 61, 'average_sentence1_length': 328.93, 'max_sentence1_length': 2085, 'unique_sentence1': 3039, 'min_sentence2_length': 70, 'average_sentence2_length': 368.48, 'max_sentence2_length': 2421, 'unique_sentence2': 3039}, 'eng-rus': {'num_samples': 9360, 'number_of_characters': 6547558, 'unique_pairs': 9360, 'min_sentence1_length': 54, 'average_sentence1_length': 340.47, 'max_sentence1_length': 3382, 'unique_sentence1': 9360, 'min_sentence2_length': 56, 'average_sentence2_length': 359.06, 'max_sentence2_length': 4018, 'unique_sentence2': 9360}, 'eng-slk': {'num_samples': 1823, 'number_of_characters': 1287409, 'unique_pairs': 1823, 'min_sentence1_length': 64, 'average_sentence1_length': 353.21, 'max_sentence1_length': 4412, 'unique_sentence1': 1823, 'min_sentence2_length': 68, 'average_sentence2_length': 352.99, 'max_sentence2_length': 4641, 'unique_sentence2': 1823}, 'eng-slv': {'num_samples': 1450, 'number_of_characters': 948874, 'unique_pairs': 1450, 'min_sentence1_length': 61, 'average_sentence1_length': 327.63, 'max_sentence1_length': 2058, 'unique_sentence1': 1450, 'min_sentence2_length': 59, 
'average_sentence2_length': 326.77, 'max_sentence2_length': 2049, 'unique_sentence2': 1450}, 'eng-spa': {'num_samples': 35446, 'number_of_characters': 29514190, 'unique_pairs': 35446, 'min_sentence1_length': 52, 'average_sentence1_length': 391.45, 'max_sentence1_length': 16651, 'unique_sentence1': 35446, 'min_sentence2_length': 47, 'average_sentence2_length': 441.2, 'max_sentence2_length': 21081, 'unique_sentence2': 35446}, 'eng-srp': {'num_samples': 303, 'number_of_characters': 196853, 'unique_pairs': 303, 'min_sentence1_length': 85, 'average_sentence1_length': 323.19, 'max_sentence1_length': 1054, 'unique_sentence1': 303, 'min_sentence2_length': 78, 'average_sentence2_length': 326.49, 'max_sentence2_length': 1025, 'unique_sentence2': 303}, 'eng-swe': {'num_samples': 6005, 'number_of_characters': 4500597, 'unique_pairs': 6005, 'min_sentence1_length': 54, 'average_sentence1_length': 373.82, 'max_sentence1_length': 14540, 'unique_sentence1': 6005, 'min_sentence2_length': 56, 'average_sentence2_length': 375.65, 'max_sentence2_length': 15020, 'unique_sentence2': 6005}, 'eng-tgl': {'num_samples': 551, 'number_of_characters': 426303, 'unique_pairs': 551, 'min_sentence1_length': 61, 'average_sentence1_length': 346.09, 'max_sentence1_length': 1958, 'unique_sentence1': 551, 'min_sentence2_length': 57, 'average_sentence2_length': 427.6, 'max_sentence2_length': 2422, 'unique_sentence2': 551}, 'eng-tha': {'num_samples': 814, 'number_of_characters': 426960, 'unique_pairs': 814, 'min_sentence1_length': 67, 'average_sentence1_length': 274.76, 'max_sentence1_length': 1842, 'unique_sentence1': 814, 'min_sentence2_length': 52, 'average_sentence2_length': 249.76, 'max_sentence2_length': 1669, 'unique_sentence2': 814}, 'eng-tur': {'num_samples': 4606, 'number_of_characters': 3315390, 'unique_pairs': 4606, 'min_sentence1_length': 58, 'average_sentence1_length': 353.38, 'max_sentence1_length': 5595, 'unique_sentence1': 4606, 'min_sentence2_length': 58, 'average_sentence2_length': 
366.42, 'max_sentence2_length': 6024, 'unique_sentence2': 4606}, 'eng-ukr': {'num_samples': 3778, 'number_of_characters': 2803318, 'unique_pairs': 3778, 'min_sentence1_length': 59, 'average_sentence1_length': 367.93, 'max_sentence1_length': 3627, 'unique_sentence1': 3778, 'min_sentence2_length': 49, 'average_sentence2_length': 374.08, 'max_sentence2_length': 3991, 'unique_sentence2': 3778}, 'eng-urd': {'num_samples': 268, 'number_of_characters': 137458, 'unique_pairs': 268, 'min_sentence1_length': 67, 'average_sentence1_length': 253.83, 'max_sentence1_length': 736, 'unique_sentence1': 268, 'min_sentence2_length': 57, 'average_sentence2_length': 259.07, 'max_sentence2_length': 795, 'unique_sentence2': 268}, 'eng-vie': {'num_samples': 1264, 'number_of_characters': 866332, 'unique_pairs': 1264, 'min_sentence1_length': 60, 'average_sentence1_length': 330.71, 'max_sentence1_length': 4253, 'unique_sentence1': 1264, 'min_sentence2_length': 59, 'average_sentence2_length': 354.68, 'max_sentence2_length': 3780, 'unique_sentence2': 1264}, 'eng-zho': {'num_samples': 4959, 'number_of_characters': 2696879, 'unique_pairs': 4959, 'min_sentence1_length': 75, 'average_sentence1_length': 402.38, 'max_sentence1_length': 14540, 'unique_sentence1': 4959, 'min_sentence2_length': 41, 'average_sentence2_length': 141.46, 'max_sentence2_length': 4500, 'unique_sentence2': 4959}}}} | | [WebFAQBitextMiningQuestions](https://huggingface.co/PaDaS-Lab) (Michael Dinzinger, 2025) | ['ara', 'aze', 'ben', 'bul', 'cat', 'ces', 'dan', 'deu', 'ell', 'eng', 'est', 'fas', 'fin', 'fra', 'heb', 'hin', 'hrv', 'hun', 'ind', 'isl', 'ita', 'jpn', 'kat', 'kaz', 'kor', 'lav', 'lit', 'mar', 'msa', 'nld', 'nor', 'pol', 'por', 'ron', 'rus', 'slk', 'slv', 'spa', 'srp', 'swe', 'tgl', 'tha', 'tur', 'ukr', 'urd', 'vie', 'zho'] | BitextMining | s2s | [Web, Written] | {'default': 682057} | {'default': {'num_samples': 682057, 'number_of_characters': 71984237, 'unique_pairs': 681597, 'min_sentence1_length': 6, 
'average_sentence1_length': 52.72, 'max_sentence1_length': 847, 'unique_sentence1': 379086, 'min_sentence2_length': 6, 'average_sentence2_length': 52.82, 'max_sentence2_length': 773, 'unique_sentence2': 398743, 'hf_subset_descriptive_stats': {'ara-fas': {'num_samples': 609, 'number_of_characters': 53319, 'unique_pairs': 609, 'min_sentence1_length': 10, 'average_sentence1_length': 41.9, 'max_sentence1_length': 164, 'unique_sentence1': 609, 'min_sentence2_length': 11, 'average_sentence2_length': 45.65, 'max_sentence2_length': 177, 'unique_sentence2': 609}, 'ara-heb': {'num_samples': 978, 'number_of_characters': 93188, 'unique_pairs': 978, 'min_sentence1_length': 10, 'average_sentence1_length': 48.34, 'max_sentence1_length': 158, 'unique_sentence1': 978, 'min_sentence2_length': 9, 'average_sentence2_length': 46.95, 'max_sentence2_length': 160, 'unique_sentence2': 978}, 'jpn-kor': {'num_samples': 4820, 'number_of_characters': 310427, 'unique_pairs': 4820, 'min_sentence1_length': 10, 'average_sentence1_length': 31.91, 'max_sentence1_length': 146, 'unique_sentence1': 4820, 'min_sentence2_length': 7, 'average_sentence2_length': 32.5, 'max_sentence2_length': 208, 'unique_sentence2': 4820}, 'jpn-vie': {'num_samples': 1356, 'number_of_characters': 110198, 'unique_pairs': 1356, 'min_sentence1_length': 10, 'average_sentence1_length': 28.55, 'max_sentence1_length': 152, 'unique_sentence1': 1356, 'min_sentence2_length': 10, 'average_sentence2_length': 52.72, 'max_sentence2_length': 344, 'unique_sentence2': 1356}, 'jpn-zho': {'num_samples': 1728, 'number_of_characters': 86936, 'unique_pairs': 1728, 'min_sentence1_length': 7, 'average_sentence1_length': 29.03, 'max_sentence1_length': 135, 'unique_sentence1': 1728, 'min_sentence2_length': 8, 'average_sentence2_length': 21.28, 'max_sentence2_length': 126, 'unique_sentence2': 1728}, 'kor-vie': {'num_samples': 1386, 'number_of_characters': 116716, 'unique_pairs': 1386, 'min_sentence1_length': 10, 'average_sentence1_length': 30.34, 
'max_sentence1_length': 108, 'unique_sentence1': 1386, 'min_sentence2_length': 10, 'average_sentence2_length': 53.87, 'max_sentence2_length': 176, 'unique_sentence2': 1386}, 'kor-zho': {'num_samples': 1087, 'number_of_characters': 56152, 'unique_pairs': 1087, 'min_sentence1_length': 10, 'average_sentence1_length': 30.24, 'max_sentence1_length': 109, 'unique_sentence1': 1087, 'min_sentence2_length': 10, 'average_sentence2_length': 21.42, 'max_sentence2_length': 95, 'unique_sentence2': 1087}, 'vie-zho': {'num_samples': 646, 'number_of_characters': 46981, 'unique_pairs': 646, 'min_sentence1_length': 11, 'average_sentence1_length': 52.13, 'max_sentence1_length': 258, 'unique_sentence1': 646, 'min_sentence2_length': 10, 'average_sentence2_length': 20.6, 'max_sentence2_length': 78, 'unique_sentence2': 646}, 'ind-msa': {'num_samples': 455, 'number_of_characters': 61802, 'unique_pairs': 455, 'min_sentence1_length': 17, 'average_sentence1_length': 69.13, 'max_sentence1_length': 135, 'unique_sentence1': 455, 'min_sentence2_length': 16, 'average_sentence2_length': 66.7, 'max_sentence2_length': 156, 'unique_sentence2': 455}, 'ind-tgl': {'num_samples': 378, 'number_of_characters': 40452, 'unique_pairs': 378, 'min_sentence1_length': 13, 'average_sentence1_length': 51.3, 'max_sentence1_length': 122, 'unique_sentence1': 378, 'min_sentence2_length': 13, 'average_sentence2_length': 55.71, 'max_sentence2_length': 136, 'unique_sentence2': 378}, 'ind-tha': {'num_samples': 1258, 'number_of_characters': 129949, 'unique_pairs': 1258, 'min_sentence1_length': 11, 'average_sentence1_length': 56.66, 'max_sentence1_length': 159, 'unique_sentence1': 1258, 'min_sentence2_length': 11, 'average_sentence2_length': 46.64, 'max_sentence2_length': 153, 'unique_sentence2': 1258}, 'bul-ces': {'num_samples': 1485, 'number_of_characters': 156647, 'unique_pairs': 1485, 'min_sentence1_length': 12, 'average_sentence1_length': 55.73, 'max_sentence1_length': 273, 'unique_sentence1': 1485, 
'min_sentence2_length': 10, 'average_sentence2_length': 49.75, 'max_sentence2_length': 265, 'unique_sentence2': 1485}, 'bul-lav': {'num_samples': 710, 'number_of_characters': 72767, 'unique_pairs': 710, 'min_sentence1_length': 14, 'average_sentence1_length': 52.81, 'max_sentence1_length': 169, 'unique_sentence1': 710, 'min_sentence2_length': 13, 'average_sentence2_length': 49.68, 'max_sentence2_length': 137, 'unique_sentence2': 710}, 'bul-lit': {'num_samples': 803, 'number_of_characters': 83819, 'unique_pairs': 803, 'min_sentence1_length': 14, 'average_sentence1_length': 53.81, 'max_sentence1_length': 223, 'unique_sentence1': 803, 'min_sentence2_length': 14, 'average_sentence2_length': 50.57, 'max_sentence2_length': 188, 'unique_sentence2': 803}, 'bul-pol': {'num_samples': 1635, 'number_of_characters': 182803, 'unique_pairs': 1635, 'min_sentence1_length': 12, 'average_sentence1_length': 56.71, 'max_sentence1_length': 224, 'unique_sentence1': 1635, 'min_sentence2_length': 14, 'average_sentence2_length': 55.1, 'max_sentence2_length': 223, 'unique_sentence2': 1635}, 'bul-rus': {'num_samples': 1476, 'number_of_characters': 171755, 'unique_pairs': 1476, 'min_sentence1_length': 11, 'average_sentence1_length': 58.61, 'max_sentence1_length': 273, 'unique_sentence1': 1476, 'min_sentence2_length': 13, 'average_sentence2_length': 57.75, 'max_sentence2_length': 263, 'unique_sentence2': 1476}, 'bul-slk': {'num_samples': 1154, 'number_of_characters': 122062, 'unique_pairs': 1154, 'min_sentence1_length': 14, 'average_sentence1_length': 55.47, 'max_sentence1_length': 273, 'unique_sentence1': 1154, 'min_sentence2_length': 12, 'average_sentence2_length': 50.31, 'max_sentence2_length': 234, 'unique_sentence2': 1154}, 'bul-slv': {'num_samples': 1034, 'number_of_characters': 120376, 'unique_pairs': 1034, 'min_sentence1_length': 13, 'average_sentence1_length': 60.39, 'max_sentence1_length': 273, 'unique_sentence1': 1034, 'min_sentence2_length': 12, 'average_sentence2_length': 56.03, 
'max_sentence2_length': 252, 'unique_sentence2': 1034}, 'bul-srp': {'num_samples': 296, 'number_of_characters': 29879, 'unique_pairs': 296, 'min_sentence1_length': 17, 'average_sentence1_length': 51.28, 'max_sentence1_length': 146, 'unique_sentence1': 296, 'min_sentence2_length': 10, 'average_sentence2_length': 49.66, 'max_sentence2_length': 118, 'unique_sentence2': 296}, 'bul-ukr': {'num_samples': 1074, 'number_of_characters': 120520, 'unique_pairs': 1074, 'min_sentence1_length': 11, 'average_sentence1_length': 57.26, 'max_sentence1_length': 155, 'unique_sentence1': 1074, 'min_sentence2_length': 12, 'average_sentence2_length': 54.96, 'max_sentence2_length': 152, 'unique_sentence2': 1074}, 'ces-lav': {'num_samples': 875, 'number_of_characters': 81949, 'unique_pairs': 875, 'min_sentence1_length': 12, 'average_sentence1_length': 45.45, 'max_sentence1_length': 133, 'unique_sentence1': 875, 'min_sentence2_length': 13, 'average_sentence2_length': 48.2, 'max_sentence2_length': 151, 'unique_sentence2': 875}, 'ces-lit': {'num_samples': 1002, 'number_of_characters': 95191, 'unique_pairs': 1002, 'min_sentence1_length': 10, 'average_sentence1_length': 45.99, 'max_sentence1_length': 129, 'unique_sentence1': 1002, 'min_sentence2_length': 12, 'average_sentence2_length': 49.01, 'max_sentence2_length': 177, 'unique_sentence2': 1002}, 'ces-pol': {'num_samples': 3367, 'number_of_characters': 335068, 'unique_pairs': 3367, 'min_sentence1_length': 10, 'average_sentence1_length': 47.42, 'max_sentence1_length': 211, 'unique_sentence1': 3367, 'min_sentence2_length': 13, 'average_sentence2_length': 52.09, 'max_sentence2_length': 231, 'unique_sentence2': 3367}, 'ces-rus': {'num_samples': 2144, 'number_of_characters': 221574, 'unique_pairs': 2144, 'min_sentence1_length': 10, 'average_sentence1_length': 48.92, 'max_sentence1_length': 265, 'unique_sentence1': 2144, 'min_sentence2_length': 13, 'average_sentence2_length': 54.42, 'max_sentence2_length': 263, 'unique_sentence2': 2144}, 'ces-slk': 
{'num_samples': 2551, 'number_of_characters': 253025, 'unique_pairs': 2551, 'min_sentence1_length': 10, 'average_sentence1_length': 48.91, 'max_sentence1_length': 265, 'unique_sentence1': 2551, 'min_sentence2_length': 10, 'average_sentence2_length': 50.28, 'max_sentence2_length': 237, 'unique_sentence2': 2551}, 'ces-slv': {'num_samples': 1370, 'number_of_characters': 145413, 'unique_pairs': 1370, 'min_sentence1_length': 10, 'average_sentence1_length': 51.01, 'max_sentence1_length': 265, 'unique_sentence1': 1370, 'min_sentence2_length': 11, 'average_sentence2_length': 55.13, 'max_sentence2_length': 252, 'unique_sentence2': 1370}, 'ces-srp': {'num_samples': 362, 'number_of_characters': 37517, 'unique_pairs': 362, 'min_sentence1_length': 15, 'average_sentence1_length': 49.99, 'max_sentence1_length': 129, 'unique_sentence1': 362, 'min_sentence2_length': 16, 'average_sentence2_length': 53.65, 'max_sentence2_length': 129, 'unique_sentence2': 362}, 'ces-ukr': {'num_samples': 1285, 'number_of_characters': 133762, 'unique_pairs': 1285, 'min_sentence1_length': 10, 'average_sentence1_length': 49.67, 'max_sentence1_length': 233, 'unique_sentence1': 1285, 'min_sentence2_length': 12, 'average_sentence2_length': 54.42, 'max_sentence2_length': 253, 'unique_sentence2': 1285}, 'hrv-slk': {'num_samples': 313, 'number_of_characters': 38510, 'unique_pairs': 313, 'min_sentence1_length': 16, 'average_sentence1_length': 61.02, 'max_sentence1_length': 174, 'unique_sentence1': 313, 'min_sentence2_length': 15, 'average_sentence2_length': 62.02, 'max_sentence2_length': 182, 'unique_sentence2': 313}, 'kat-rus': {'num_samples': 262, 'number_of_characters': 26840, 'unique_pairs': 262, 'min_sentence1_length': 14, 'average_sentence1_length': 51.6, 'max_sentence1_length': 179, 'unique_sentence1': 262, 'min_sentence2_length': 16, 'average_sentence2_length': 50.85, 'max_sentence2_length': 186, 'unique_sentence2': 262}, 'lav-lit': {'num_samples': 1061, 'number_of_characters': 101845, 'unique_pairs': 
1061, 'min_sentence1_length': 11, 'average_sentence1_length': 47.87, 'max_sentence1_length': 283, 'unique_sentence1': 1061, 'min_sentence2_length': 12, 'average_sentence2_length': 48.12, 'max_sentence2_length': 268, 'unique_sentence2': 1061}, 'lav-pol': {'num_samples': 951, 'number_of_characters': 94163, 'unique_pairs': 951, 'min_sentence1_length': 11, 'average_sentence1_length': 48.29, 'max_sentence1_length': 172, 'unique_sentence1': 951, 'min_sentence2_length': 14, 'average_sentence2_length': 50.72, 'max_sentence2_length': 208, 'unique_sentence2': 951}, 'lav-rus': {'num_samples': 1412, 'number_of_characters': 141609, 'unique_pairs': 1412, 'min_sentence1_length': 11, 'average_sentence1_length': 49.2, 'max_sentence1_length': 146, 'unique_sentence1': 1412, 'min_sentence2_length': 13, 'average_sentence2_length': 51.09, 'max_sentence2_length': 182, 'unique_sentence2': 1412}, 'lav-slk': {'num_samples': 789, 'number_of_characters': 72654, 'unique_pairs': 789, 'min_sentence1_length': 13, 'average_sentence1_length': 46.75, 'max_sentence1_length': 172, 'unique_sentence1': 789, 'min_sentence2_length': 12, 'average_sentence2_length': 45.33, 'max_sentence2_length': 162, 'unique_sentence2': 789}, 'lav-slv': {'num_samples': 518, 'number_of_characters': 48873, 'unique_pairs': 518, 'min_sentence1_length': 13, 'average_sentence1_length': 46.73, 'max_sentence1_length': 126, 'unique_sentence1': 518, 'min_sentence2_length': 13, 'average_sentence2_length': 47.62, 'max_sentence2_length': 125, 'unique_sentence2': 518}, 'lav-ukr': {'num_samples': 579, 'number_of_characters': 52695, 'unique_pairs': 579, 'min_sentence1_length': 11, 'average_sentence1_length': 45.8, 'max_sentence1_length': 151, 'unique_sentence1': 579, 'min_sentence2_length': 12, 'average_sentence2_length': 45.21, 'max_sentence2_length': 170, 'unique_sentence2': 579}, 'lit-pol': {'num_samples': 1026, 'number_of_characters': 102814, 'unique_pairs': 1026, 'min_sentence1_length': 13, 'average_sentence1_length': 48.94, 
'max_sentence1_length': 188, 'unique_sentence1': 1026, 'min_sentence2_length': 15, 'average_sentence2_length': 51.27, 'max_sentence2_length': 206, 'unique_sentence2': 1026}, 'lit-rus': {'num_samples': 961, 'number_of_characters': 96822, 'unique_pairs': 961, 'min_sentence1_length': 12, 'average_sentence1_length': 49.5, 'max_sentence1_length': 155, 'unique_sentence1': 961, 'min_sentence2_length': 12, 'average_sentence2_length': 51.25, 'max_sentence2_length': 182, 'unique_sentence2': 961}, 'lit-slk': {'num_samples': 859, 'number_of_characters': 82605, 'unique_pairs': 859, 'min_sentence1_length': 12, 'average_sentence1_length': 49.03, 'max_sentence1_length': 188, 'unique_sentence1': 859, 'min_sentence2_length': 10, 'average_sentence2_length': 47.13, 'max_sentence2_length': 195, 'unique_sentence2': 859}, 'lit-slv': {'num_samples': 607, 'number_of_characters': 58755, 'unique_pairs': 607, 'min_sentence1_length': 12, 'average_sentence1_length': 48.06, 'max_sentence1_length': 155, 'unique_sentence1': 607, 'min_sentence2_length': 11, 'average_sentence2_length': 48.74, 'max_sentence2_length': 145, 'unique_sentence2': 607}, 'lit-ukr': {'num_samples': 639, 'number_of_characters': 59450, 'unique_pairs': 639, 'min_sentence1_length': 14, 'average_sentence1_length': 46.94, 'max_sentence1_length': 149, 'unique_sentence1': 639, 'min_sentence2_length': 13, 'average_sentence2_length': 46.1, 'max_sentence2_length': 170, 'unique_sentence2': 639}, 'pol-rus': {'num_samples': 5014, 'number_of_characters': 536093, 'unique_pairs': 5014, 'min_sentence1_length': 12, 'average_sentence1_length': 53.29, 'max_sentence1_length': 424, 'unique_sentence1': 5014, 'min_sentence2_length': 11, 'average_sentence2_length': 53.63, 'max_sentence2_length': 456, 'unique_sentence2': 5014}, 'pol-slk': {'num_samples': 1918, 'number_of_characters': 196496, 'unique_pairs': 1918, 'min_sentence1_length': 14, 'average_sentence1_length': 53.03, 'max_sentence1_length': 206, 'unique_sentence1': 1918, 
'min_sentence2_length': 10, 'average_sentence2_length': 49.42, 'max_sentence2_length': 194, 'unique_sentence2': 1918}, 'pol-slv': {'num_samples': 1382, 'number_of_characters': 160396, 'unique_pairs': 1382, 'min_sentence1_length': 9, 'average_sentence1_length': 57.59, 'max_sentence1_length': 180, 'unique_sentence1': 1382, 'min_sentence2_length': 8, 'average_sentence2_length': 58.47, 'max_sentence2_length': 146, 'unique_sentence2': 1382}, 'pol-srp': {'num_samples': 492, 'number_of_characters': 54772, 'unique_pairs': 492, 'min_sentence1_length': 19, 'average_sentence1_length': 54.55, 'max_sentence1_length': 148, 'unique_sentence1': 492, 'min_sentence2_length': 16, 'average_sentence2_length': 56.77, 'max_sentence2_length': 129, 'unique_sentence2': 492}, 'pol-ukr': {'num_samples': 2370, 'number_of_characters': 251424, 'unique_pairs': 2370, 'min_sentence1_length': 14, 'average_sentence1_length': 53.8, 'max_sentence1_length': 305, 'unique_sentence1': 2370, 'min_sentence2_length': 12, 'average_sentence2_length': 52.29, 'max_sentence2_length': 202, 'unique_sentence2': 2370}, 'rus-slk': {'num_samples': 1263, 'number_of_characters': 136750, 'unique_pairs': 1263, 'min_sentence1_length': 13, 'average_sentence1_length': 56.65, 'max_sentence1_length': 316, 'unique_sentence1': 1263, 'min_sentence2_length': 10, 'average_sentence2_length': 51.63, 'max_sentence2_length': 309, 'unique_sentence2': 1263}, 'rus-slv': {'num_samples': 1096, 'number_of_characters': 133263, 'unique_pairs': 1096, 'min_sentence1_length': 14, 'average_sentence1_length': 62.56, 'max_sentence1_length': 263, 'unique_sentence1': 1096, 'min_sentence2_length': 11, 'average_sentence2_length': 59.03, 'max_sentence2_length': 252, 'unique_sentence2': 1096}, 'rus-srp': {'num_samples': 455, 'number_of_characters': 51800, 'unique_pairs': 455, 'min_sentence1_length': 17, 'average_sentence1_length': 57.52, 'max_sentence1_length': 122, 'unique_sentence1': 455, 'min_sentence2_length': 12, 'average_sentence2_length': 56.33, 
'max_sentence2_length': 129, 'unique_sentence2': 455}, 'rus-ukr': {'num_samples': 15251, 'number_of_characters': 1504266, 'unique_pairs': 15251, 'min_sentence1_length': 10, 'average_sentence1_length': 49.98, 'max_sentence1_length': 308, 'unique_sentence1': 15251, 'min_sentence2_length': 10, 'average_sentence2_length': 48.66, 'max_sentence2_length': 353, 'unique_sentence2': 15251}, 'slk-slv': {'num_samples': 1259, 'number_of_characters': 133621, 'unique_pairs': 1259, 'min_sentence1_length': 10, 'average_sentence1_length': 50.42, 'max_sentence1_length': 234, 'unique_sentence1': 1259, 'min_sentence2_length': 11, 'average_sentence2_length': 55.71, 'max_sentence2_length': 252, 'unique_sentence2': 1259}, 'slk-srp': {'num_samples': 561, 'number_of_characters': 57637, 'unique_pairs': 561, 'min_sentence1_length': 15, 'average_sentence1_length': 47.93, 'max_sentence1_length': 117, 'unique_sentence1': 561, 'min_sentence2_length': 16, 'average_sentence2_length': 54.81, 'max_sentence2_length': 112, 'unique_sentence2': 561}, 'slk-ukr': {'num_samples': 944, 'number_of_characters': 90612, 'unique_pairs': 944, 'min_sentence1_length': 10, 'average_sentence1_length': 46.87, 'max_sentence1_length': 237, 'unique_sentence1': 944, 'min_sentence2_length': 12, 'average_sentence2_length': 49.11, 'max_sentence2_length': 253, 'unique_sentence2': 944}, 'slv-srp': {'num_samples': 499, 'number_of_characters': 60828, 'unique_pairs': 499, 'min_sentence1_length': 16, 'average_sentence1_length': 63.64, 'max_sentence1_length': 122, 'unique_sentence1': 499, 'min_sentence2_length': 16, 'average_sentence2_length': 58.26, 'max_sentence2_length': 118, 'unique_sentence2': 499}, 'slv-ukr': {'num_samples': 733, 'number_of_characters': 86586, 'unique_pairs': 733, 'min_sentence1_length': 11, 'average_sentence1_length': 57.88, 'max_sentence1_length': 124, 'unique_sentence1': 733, 'min_sentence2_length': 12, 'average_sentence2_length': 60.24, 'max_sentence2_length': 132, 'unique_sentence2': 733}, 'cat-deu': 
{'num_samples': 302, 'number_of_characters': 33752, 'unique_pairs': 302, 'min_sentence1_length': 13, 'average_sentence1_length': 53.35, 'max_sentence1_length': 255, 'unique_sentence1': 302, 'min_sentence2_length': 12, 'average_sentence2_length': 58.41, 'max_sentence2_length': 269, 'unique_sentence2': 302}, 'cat-fra': {'num_samples': 598, 'number_of_characters': 69263, 'unique_pairs': 598, 'min_sentence1_length': 14, 'average_sentence1_length': 53.4, 'max_sentence1_length': 202, 'unique_sentence1': 598, 'min_sentence2_length': 16, 'average_sentence2_length': 62.42, 'max_sentence2_length': 235, 'unique_sentence2': 598}, 'cat-ita': {'num_samples': 418, 'number_of_characters': 43550, 'unique_pairs': 418, 'min_sentence1_length': 15, 'average_sentence1_length': 50.67, 'max_sentence1_length': 195, 'unique_sentence1': 418, 'min_sentence2_length': 15, 'average_sentence2_length': 53.51, 'max_sentence2_length': 230, 'unique_sentence2': 418}, 'cat-por': {'num_samples': 370, 'number_of_characters': 36411, 'unique_pairs': 370, 'min_sentence1_length': 15, 'average_sentence1_length': 48.72, 'max_sentence1_length': 193, 'unique_sentence1': 370, 'min_sentence2_length': 15, 'average_sentence2_length': 49.69, 'max_sentence2_length': 211, 'unique_sentence2': 370}, 'cat-spa': {'num_samples': 2648, 'number_of_characters': 285897, 'unique_pairs': 2648, 'min_sentence1_length': 10, 'average_sentence1_length': 52.98, 'max_sentence1_length': 216, 'unique_sentence1': 2648, 'min_sentence2_length': 11, 'average_sentence2_length': 54.98, 'max_sentence2_length': 247, 'unique_sentence2': 2648}, 'dan-deu': {'num_samples': 4337, 'number_of_characters': 463434, 'unique_pairs': 4337, 'min_sentence1_length': 11, 'average_sentence1_length': 50.84, 'max_sentence1_length': 198, 'unique_sentence1': 4337, 'min_sentence2_length': 11, 'average_sentence2_length': 56.02, 'max_sentence2_length': 224, 'unique_sentence2': 4337}, 'dan-fra': {'num_samples': 3802, 'number_of_characters': 434788, 'unique_pairs': 3802, 
'min_sentence1_length': 12, 'average_sentence1_length': 51.92, 'max_sentence1_length': 296, 'unique_sentence1': 3802, 'min_sentence2_length': 11, 'average_sentence2_length': 62.44, 'max_sentence2_length': 345, 'unique_sentence2': 3802}, 'dan-isl': {'num_samples': 327, 'number_of_characters': 32853, 'unique_pairs': 327, 'min_sentence1_length': 12, 'average_sentence1_length': 51.35, 'max_sentence1_length': 198, 'unique_sentence1': 327, 'min_sentence2_length': 12, 'average_sentence2_length': 49.11, 'max_sentence2_length': 181, 'unique_sentence2': 327}, 'dan-ita': {'num_samples': 3818, 'number_of_characters': 421045, 'unique_pairs': 3818, 'min_sentence1_length': 12, 'average_sentence1_length': 53.2, 'max_sentence1_length': 296, 'unique_sentence1': 3818, 'min_sentence2_length': 11, 'average_sentence2_length': 57.08, 'max_sentence2_length': 271, 'unique_sentence2': 3818}, 'dan-nld': {'num_samples': 4099, 'number_of_characters': 428737, 'unique_pairs': 4099, 'min_sentence1_length': 11, 'average_sentence1_length': 51.53, 'max_sentence1_length': 225, 'unique_sentence1': 4099, 'min_sentence2_length': 10, 'average_sentence2_length': 53.06, 'max_sentence2_length': 254, 'unique_sentence2': 4099}, 'dan-nor': {'num_samples': 2603, 'number_of_characters': 278953, 'unique_pairs': 2603, 'min_sentence1_length': 13, 'average_sentence1_length': 54.74, 'max_sentence1_length': 296, 'unique_sentence1': 2603, 'min_sentence2_length': 11, 'average_sentence2_length': 52.42, 'max_sentence2_length': 291, 'unique_sentence2': 2603}, 'dan-por': {'num_samples': 3206, 'number_of_characters': 349267, 'unique_pairs': 3206, 'min_sentence1_length': 12, 'average_sentence1_length': 52.78, 'max_sentence1_length': 176, 'unique_sentence1': 3206, 'min_sentence2_length': 10, 'average_sentence2_length': 56.16, 'max_sentence2_length': 211, 'unique_sentence2': 3206}, 'dan-ron': {'num_samples': 2052, 'number_of_characters': 225425, 'unique_pairs': 2052, 'min_sentence1_length': 8, 'average_sentence1_length': 53.82, 
'max_sentence1_length': 296, 'unique_sentence1': 2052, 'min_sentence2_length': 9, 'average_sentence2_length': 56.03, 'max_sentence2_length': 281, 'unique_sentence2': 2052}, 'dan-spa': {'num_samples': 3571, 'number_of_characters': 389048, 'unique_pairs': 3571, 'min_sentence1_length': 12, 'average_sentence1_length': 52.06, 'max_sentence1_length': 257, 'unique_sentence1': 3571, 'min_sentence2_length': 12, 'average_sentence2_length': 56.89, 'max_sentence2_length': 239, 'unique_sentence2': 3571}, 'dan-swe': {'num_samples': 4268, 'number_of_characters': 440347, 'unique_pairs': 4268, 'min_sentence1_length': 12, 'average_sentence1_length': 52.09, 'max_sentence1_length': 315, 'unique_sentence1': 4268, 'min_sentence2_length': 11, 'average_sentence2_length': 51.09, 'max_sentence2_length': 330, 'unique_sentence2': 4268}, 'deu-fra': {'num_samples': 27727, 'number_of_characters': 3222716, 'unique_pairs': 27727, 'min_sentence1_length': 6, 'average_sentence1_length': 55.56, 'max_sentence1_length': 337, 'unique_sentence1': 27727, 'min_sentence2_length': 6, 'average_sentence2_length': 60.67, 'max_sentence2_length': 330, 'unique_sentence2': 27727}, 'deu-isl': {'num_samples': 294, 'number_of_characters': 31097, 'unique_pairs': 294, 'min_sentence1_length': 14, 'average_sentence1_length': 56.09, 'max_sentence1_length': 201, 'unique_sentence1': 294, 'min_sentence2_length': 14, 'average_sentence2_length': 49.69, 'max_sentence2_length': 181, 'unique_sentence2': 294}, 'deu-ita': {'num_samples': 18787, 'number_of_characters': 2100285, 'unique_pairs': 18787, 'min_sentence1_length': 10, 'average_sentence1_length': 56.25, 'max_sentence1_length': 346, 'unique_sentence1': 18787, 'min_sentence2_length': 10, 'average_sentence2_length': 55.54, 'max_sentence2_length': 357, 'unique_sentence2': 18787}, 'deu-nld': {'num_samples': 14211, 'number_of_characters': 1508365, 'unique_pairs': 14211, 'min_sentence1_length': 10, 'average_sentence1_length': 54.51, 'max_sentence1_length': 299, 'unique_sentence1': 
14211, 'min_sentence2_length': 10, 'average_sentence2_length': 51.63, 'max_sentence2_length': 276, 'unique_sentence2': 14211}, 'deu-nor': {'num_samples': 2783, 'number_of_characters': 295406, 'unique_pairs': 2783, 'min_sentence1_length': 10, 'average_sentence1_length': 56.26, 'max_sentence1_length': 224, 'unique_sentence1': 2783, 'min_sentence2_length': 11, 'average_sentence2_length': 49.89, 'max_sentence2_length': 163, 'unique_sentence2': 2783}, 'deu-por': {'num_samples': 11319, 'number_of_characters': 1213812, 'unique_pairs': 11319, 'min_sentence1_length': 9, 'average_sentence1_length': 54.7, 'max_sentence1_length': 374, 'unique_sentence1': 11319, 'min_sentence2_length': 10, 'average_sentence2_length': 52.54, 'max_sentence2_length': 337, 'unique_sentence2': 11319}, 'deu-ron': {'num_samples': 3598, 'number_of_characters': 401116, 'unique_pairs': 3598, 'min_sentence1_length': 8, 'average_sentence1_length': 57.01, 'max_sentence1_length': 716, 'unique_sentence1': 3598, 'min_sentence2_length': 9, 'average_sentence2_length': 54.48, 'max_sentence2_length': 699, 'unique_sentence2': 3598}, 'deu-spa': {'num_samples': 19739, 'number_of_characters': 2151487, 'unique_pairs': 19739, 'min_sentence1_length': 9, 'average_sentence1_length': 54.84, 'max_sentence1_length': 428, 'unique_sentence1': 19739, 'min_sentence2_length': 10, 'average_sentence2_length': 54.16, 'max_sentence2_length': 415, 'unique_sentence2': 19739}, 'deu-swe': {'num_samples': 5772, 'number_of_characters': 610949, 'unique_pairs': 5772, 'min_sentence1_length': 10, 'average_sentence1_length': 55.78, 'max_sentence1_length': 264, 'unique_sentence1': 5772, 'min_sentence2_length': 10, 'average_sentence2_length': 50.06, 'max_sentence2_length': 237, 'unique_sentence2': 5772}, 'fra-isl': {'num_samples': 347, 'number_of_characters': 39368, 'unique_pairs': 347, 'min_sentence1_length': 14, 'average_sentence1_length': 62.12, 'max_sentence1_length': 229, 'unique_sentence1': 347, 'min_sentence2_length': 16, 
'average_sentence2_length': 51.34, 'max_sentence2_length': 194, 'unique_sentence2': 347}, 'fra-ita': {'num_samples': 20002, 'number_of_characters': 2326983, 'unique_pairs': 20002, 'min_sentence1_length': 10, 'average_sentence1_length': 61.12, 'max_sentence1_length': 743, 'unique_sentence1': 20002, 'min_sentence2_length': 8, 'average_sentence2_length': 55.22, 'max_sentence2_length': 653, 'unique_sentence2': 20002}, 'fra-nld': {'num_samples': 14684, 'number_of_characters': 1659621, 'unique_pairs': 14684, 'min_sentence1_length': 10, 'average_sentence1_length': 60.34, 'max_sentence1_length': 400, 'unique_sentence1': 14684, 'min_sentence2_length': 10, 'average_sentence2_length': 52.69, 'max_sentence2_length': 263, 'unique_sentence2': 14684}, 'fra-nor': {'num_samples': 2558, 'number_of_characters': 290540, 'unique_pairs': 2558, 'min_sentence1_length': 10, 'average_sentence1_length': 63.31, 'max_sentence1_length': 377, 'unique_sentence1': 2558, 'min_sentence2_length': 11, 'average_sentence2_length': 50.27, 'max_sentence2_length': 305, 'unique_sentence2': 2558}, 'fra-por': {'num_samples': 13265, 'number_of_characters': 1485292, 'unique_pairs': 13265, 'min_sentence1_length': 10, 'average_sentence1_length': 59.58, 'max_sentence1_length': 284, 'unique_sentence1': 13265, 'min_sentence2_length': 8, 'average_sentence2_length': 52.39, 'max_sentence2_length': 443, 'unique_sentence2': 13265}, 'fra-ron': {'num_samples': 3295, 'number_of_characters': 381273, 'unique_pairs': 3295, 'min_sentence1_length': 8, 'average_sentence1_length': 61.62, 'max_sentence1_length': 677, 'unique_sentence1': 3295, 'min_sentence2_length': 8, 'average_sentence2_length': 54.1, 'max_sentence2_length': 587, 'unique_sentence2': 3295}, 'fra-spa': {'num_samples': 23311, 'number_of_characters': 2650610, 'unique_pairs': 23311, 'min_sentence1_length': 10, 'average_sentence1_length': 59.63, 'max_sentence1_length': 677, 'unique_sentence1': 23311, 'min_sentence2_length': 8, 'average_sentence2_length': 54.08, 
'max_sentence2_length': 576, 'unique_sentence2': 23311}, 'fra-swe': {'num_samples': 5006, 'number_of_characters': 565299, 'unique_pairs': 5006, 'min_sentence1_length': 10, 'average_sentence1_length': 62.23, 'max_sentence1_length': 345, 'unique_sentence1': 5006, 'min_sentence2_length': 11, 'average_sentence2_length': 50.7, 'max_sentence2_length': 289, 'unique_sentence2': 5006}, 'isl-ita': {'num_samples': 421, 'number_of_characters': 42563, 'unique_pairs': 421, 'min_sentence1_length': 14, 'average_sentence1_length': 48.12, 'max_sentence1_length': 145, 'unique_sentence1': 421, 'min_sentence2_length': 13, 'average_sentence2_length': 52.98, 'max_sentence2_length': 151, 'unique_sentence2': 421}, 'isl-nld': {'num_samples': 311, 'number_of_characters': 33292, 'unique_pairs': 311, 'min_sentence1_length': 13, 'average_sentence1_length': 51.5, 'max_sentence1_length': 160, 'unique_sentence1': 311, 'min_sentence2_length': 11, 'average_sentence2_length': 55.55, 'max_sentence2_length': 228, 'unique_sentence2': 311}, 'isl-por': {'num_samples': 341, 'number_of_characters': 34367, 'unique_pairs': 341, 'min_sentence1_length': 12, 'average_sentence1_length': 48.83, 'max_sentence1_length': 135, 'unique_sentence1': 341, 'min_sentence2_length': 12, 'average_sentence2_length': 51.96, 'max_sentence2_length': 156, 'unique_sentence2': 341}, 'isl-spa': {'num_samples': 366, 'number_of_characters': 38563, 'unique_pairs': 366, 'min_sentence1_length': 14, 'average_sentence1_length': 49.8, 'max_sentence1_length': 196, 'unique_sentence1': 366, 'min_sentence2_length': 17, 'average_sentence2_length': 55.56, 'max_sentence2_length': 206, 'unique_sentence2': 366}, 'isl-swe': {'num_samples': 312, 'number_of_characters': 30662, 'unique_pairs': 312, 'min_sentence1_length': 14, 'average_sentence1_length': 48.84, 'max_sentence1_length': 149, 'unique_sentence1': 312, 'min_sentence2_length': 15, 'average_sentence2_length': 49.44, 'max_sentence2_length': 141, 'unique_sentence2': 312}, 'ita-nld': {'num_samples': 
9160, 'number_of_characters': 973718, 'unique_pairs': 9160, 'min_sentence1_length': 11, 'average_sentence1_length': 54.2, 'max_sentence1_length': 291, 'unique_sentence1': 9160, 'min_sentence2_length': 10, 'average_sentence2_length': 52.1, 'max_sentence2_length': 337, 'unique_sentence2': 9160}, 'ita-nor': {'num_samples': 2516, 'number_of_characters': 267838, 'unique_pairs': 2516, 'min_sentence1_length': 13, 'average_sentence1_length': 56.18, 'max_sentence1_length': 356, 'unique_sentence1': 2516, 'min_sentence2_length': 12, 'average_sentence2_length': 50.28, 'max_sentence2_length': 305, 'unique_sentence2': 2516}, 'ita-por': {'num_samples': 10924, 'number_of_characters': 1157982, 'unique_pairs': 10924, 'min_sentence1_length': 10, 'average_sentence1_length': 53.5, 'max_sentence1_length': 357, 'unique_sentence1': 10924, 'min_sentence2_length': 10, 'average_sentence2_length': 52.5, 'max_sentence2_length': 276, 'unique_sentence2': 10924}, 'ita-ron': {'num_samples': 3360, 'number_of_characters': 375697, 'unique_pairs': 3360, 'min_sentence1_length': 8, 'average_sentence1_length': 56.38, 'max_sentence1_length': 346, 'unique_sentence1': 3360, 'min_sentence2_length': 9, 'average_sentence2_length': 55.43, 'max_sentence2_length': 402, 'unique_sentence2': 3360}, 'ita-spa': {'num_samples': 16534, 'number_of_characters': 1778757, 'unique_pairs': 16534, 'min_sentence1_length': 10, 'average_sentence1_length': 53.56, 'max_sentence1_length': 362, 'unique_sentence1': 16534, 'min_sentence2_length': 11, 'average_sentence2_length': 54.02, 'max_sentence2_length': 363, 'unique_sentence2': 16534}, 'ita-swe': {'num_samples': 4741, 'number_of_characters': 508653, 'unique_pairs': 4741, 'min_sentence1_length': 10, 'average_sentence1_length': 55.87, 'max_sentence1_length': 271, 'unique_sentence1': 4741, 'min_sentence2_length': 10, 'average_sentence2_length': 51.42, 'max_sentence2_length': 289, 'unique_sentence2': 4741}, 'nld-nor': {'num_samples': 2664, 'number_of_characters': 281584, 
'unique_pairs': 2664, 'min_sentence1_length': 10, 'average_sentence1_length': 54.62, 'max_sentence1_length': 211, 'unique_sentence1': 2664, 'min_sentence2_length': 11, 'average_sentence2_length': 51.08, 'max_sentence2_length': 162, 'unique_sentence2': 2664}, 'nld-por': {'num_samples': 7021, 'number_of_characters': 738280, 'unique_pairs': 7021, 'min_sentence1_length': 10, 'average_sentence1_length': 51.91, 'max_sentence1_length': 243, 'unique_sentence1': 7021, 'min_sentence2_length': 11, 'average_sentence2_length': 53.24, 'max_sentence2_length': 443, 'unique_sentence2': 7021}, 'nld-ron': {'num_samples': 2888, 'number_of_characters': 311689, 'unique_pairs': 2888, 'min_sentence1_length': 10, 'average_sentence1_length': 53.16, 'max_sentence1_length': 228, 'unique_sentence1': 2888, 'min_sentence2_length': 11, 'average_sentence2_length': 54.77, 'max_sentence2_length': 252, 'unique_sentence2': 2888}, 'nld-spa': {'num_samples': 9555, 'number_of_characters': 1015756, 'unique_pairs': 9555, 'min_sentence1_length': 10, 'average_sentence1_length': 51.91, 'max_sentence1_length': 229, 'unique_sentence1': 9555, 'min_sentence2_length': 10, 'average_sentence2_length': 54.39, 'max_sentence2_length': 235, 'unique_sentence2': 9555}, 'nld-swe': {'num_samples': 5072, 'number_of_characters': 529162, 'unique_pairs': 5072, 'min_sentence1_length': 10, 'average_sentence1_length': 53.53, 'max_sentence1_length': 333, 'unique_sentence1': 5072, 'min_sentence2_length': 10, 'average_sentence2_length': 50.8, 'max_sentence2_length': 311, 'unique_sentence2': 5072}, 'nor-por': {'num_samples': 2096, 'number_of_characters': 227941, 'unique_pairs': 2096, 'min_sentence1_length': 11, 'average_sentence1_length': 51.43, 'max_sentence1_length': 200, 'unique_sentence1': 2096, 'min_sentence2_length': 12, 'average_sentence2_length': 57.32, 'max_sentence2_length': 180, 'unique_sentence2': 2096}, 'nor-ron': {'num_samples': 1412, 'number_of_characters': 156087, 'unique_pairs': 1412, 'min_sentence1_length': 13, 
'average_sentence1_length': 52.16, 'max_sentence1_length': 291, 'unique_sentence1': 1412, 'min_sentence2_length': 14, 'average_sentence2_length': 58.39, 'max_sentence2_length': 281, 'unique_sentence2': 1412}, 'nor-spa': {'num_samples': 2603, 'number_of_characters': 283540, 'unique_pairs': 2603, 'min_sentence1_length': 12, 'average_sentence1_length': 51.0, 'max_sentence1_length': 283, 'unique_sentence1': 2603, 'min_sentence2_length': 13, 'average_sentence2_length': 57.93, 'max_sentence2_length': 274, 'unique_sentence2': 2603}, 'nor-swe': {'num_samples': 3165, 'number_of_characters': 325198, 'unique_pairs': 3165, 'min_sentence1_length': 11, 'average_sentence1_length': 50.94, 'max_sentence1_length': 291, 'unique_sentence1': 3165, 'min_sentence2_length': 10, 'average_sentence2_length': 51.81, 'max_sentence2_length': 289, 'unique_sentence2': 3165}, 'por-ron': {'num_samples': 3026, 'number_of_characters': 341012, 'unique_pairs': 3026, 'min_sentence1_length': 12, 'average_sentence1_length': 56.57, 'max_sentence1_length': 334, 'unique_sentence1': 3026, 'min_sentence2_length': 12, 'average_sentence2_length': 56.13, 'max_sentence2_length': 402, 'unique_sentence2': 3026}, 'por-spa': {'num_samples': 16084, 'number_of_characters': 1698580, 'unique_pairs': 16084, 'min_sentence1_length': 10, 'average_sentence1_length': 51.93, 'max_sentence1_length': 350, 'unique_sentence1': 16084, 'min_sentence2_length': 10, 'average_sentence2_length': 53.68, 'max_sentence2_length': 364, 'unique_sentence2': 16084}, 'por-swe': {'num_samples': 4235, 'number_of_characters': 444824, 'unique_pairs': 4235, 'min_sentence1_length': 11, 'average_sentence1_length': 54.48, 'max_sentence1_length': 282, 'unique_sentence1': 4235, 'min_sentence2_length': 11, 'average_sentence2_length': 50.55, 'max_sentence2_length': 255, 'unique_sentence2': 4235}, 'ron-spa': {'num_samples': 3375, 'number_of_characters': 376497, 'unique_pairs': 3375, 'min_sentence1_length': 12, 'average_sentence1_length': 54.71, 
'max_sentence1_length': 587, 'unique_sentence1': 3375, 'min_sentence2_length': 11, 'average_sentence2_length': 56.85, 'max_sentence2_length': 576, 'unique_sentence2': 3375}, 'ron-swe': {'num_samples': 2154, 'number_of_characters': 238390, 'unique_pairs': 2154, 'min_sentence1_length': 9, 'average_sentence1_length': 57.04, 'max_sentence1_length': 281, 'unique_sentence1': 2154, 'min_sentence2_length': 8, 'average_sentence2_length': 53.64, 'max_sentence2_length': 289, 'unique_sentence2': 2154}, 'spa-swe': {'num_samples': 4884, 'number_of_characters': 525651, 'unique_pairs': 4884, 'min_sentence1_length': 12, 'average_sentence1_length': 56.58, 'max_sentence1_length': 244, 'unique_sentence1': 4884, 'min_sentence2_length': 10, 'average_sentence2_length': 51.04, 'max_sentence2_length': 280, 'unique_sentence2': 4884}, 'ben-hin': {'num_samples': 1174, 'number_of_characters': 115288, 'unique_pairs': 1174, 'min_sentence1_length': 11, 'average_sentence1_length': 47.5, 'max_sentence1_length': 145, 'unique_sentence1': 1174, 'min_sentence2_length': 13, 'average_sentence2_length': 50.71, 'max_sentence2_length': 149, 'unique_sentence2': 1174}, 'ben-mar': {'num_samples': 566, 'number_of_characters': 54106, 'unique_pairs': 566, 'min_sentence1_length': 12, 'average_sentence1_length': 47.8, 'max_sentence1_length': 145, 'unique_sentence1': 566, 'min_sentence2_length': 14, 'average_sentence2_length': 47.8, 'max_sentence2_length': 130, 'unique_sentence2': 566}, 'ben-urd': {'num_samples': 488, 'number_of_characters': 44387, 'unique_pairs': 488, 'min_sentence1_length': 11, 'average_sentence1_length': 44.0, 'max_sentence1_length': 110, 'unique_sentence1': 488, 'min_sentence2_length': 12, 'average_sentence2_length': 46.96, 'max_sentence2_length': 101, 'unique_sentence2': 488}, 'hin-mar': {'num_samples': 615, 'number_of_characters': 59136, 'unique_pairs': 615, 'min_sentence1_length': 11, 'average_sentence1_length': 49.57, 'max_sentence1_length': 149, 'unique_sentence1': 615, 
'min_sentence2_length': 11, 'average_sentence2_length': 46.59, 'max_sentence2_length': 143, 'unique_sentence2': 615}, 'hin-urd': {'num_samples': 545, 'number_of_characters': 52165, 'unique_pairs': 545, 'min_sentence1_length': 14, 'average_sentence1_length': 48.38, 'max_sentence1_length': 111, 'unique_sentence1': 545, 'min_sentence2_length': 12, 'average_sentence2_length': 47.34, 'max_sentence2_length': 125, 'unique_sentence2': 545}, 'mar-urd': {'num_samples': 270, 'number_of_characters': 23951, 'unique_pairs': 270, 'min_sentence1_length': 14, 'average_sentence1_length': 43.16, 'max_sentence1_length': 108, 'unique_sentence1': 270, 'min_sentence2_length': 13, 'average_sentence2_length': 45.54, 'max_sentence2_length': 107, 'unique_sentence2': 270}, 'aze-kaz': {'num_samples': 412, 'number_of_characters': 38912, 'unique_pairs': 412, 'min_sentence1_length': 12, 'average_sentence1_length': 46.7, 'max_sentence1_length': 121, 'unique_sentence1': 412, 'min_sentence2_length': 14, 'average_sentence2_length': 47.75, 'max_sentence2_length': 108, 'unique_sentence2': 412}, 'aze-tur': {'num_samples': 388, 'number_of_characters': 36138, 'unique_pairs': 388, 'min_sentence1_length': 12, 'average_sentence1_length': 46.69, 'max_sentence1_length': 124, 'unique_sentence1': 388, 'min_sentence2_length': 12, 'average_sentence2_length': 46.45, 'max_sentence2_length': 123, 'unique_sentence2': 388}, 'kaz-tur': {'num_samples': 340, 'number_of_characters': 31637, 'unique_pairs': 340, 'min_sentence1_length': 17, 'average_sentence1_length': 47.27, 'max_sentence1_length': 122, 'unique_sentence1': 340, 'min_sentence2_length': 12, 'average_sentence2_length': 45.78, 'max_sentence2_length': 114, 'unique_sentence2': 340}, 'est-fin': {'num_samples': 790, 'number_of_characters': 80226, 'unique_pairs': 790, 'min_sentence1_length': 13, 'average_sentence1_length': 50.35, 'max_sentence1_length': 158, 'unique_sentence1': 790, 'min_sentence2_length': 14, 'average_sentence2_length': 51.21, 'max_sentence2_length': 
152, 'unique_sentence2': 790}, 'est-hun': {'num_samples': 674, 'number_of_characters': 69641, 'unique_pairs': 674, 'min_sentence1_length': 8, 'average_sentence1_length': 50.49, 'max_sentence1_length': 157, 'unique_sentence1': 674, 'min_sentence2_length': 9, 'average_sentence2_length': 52.83, 'max_sentence2_length': 180, 'unique_sentence2': 674}, 'fin-hun': {'num_samples': 1542, 'number_of_characters': 167588, 'unique_pairs': 1542, 'min_sentence1_length': 8, 'average_sentence1_length': 53.55, 'max_sentence1_length': 243, 'unique_sentence1': 1542, 'min_sentence2_length': 9, 'average_sentence2_length': 55.13, 'max_sentence2_length': 228, 'unique_sentence2': 1542}, 'ara-eng': {'num_samples': 5698, 'number_of_characters': 544132, 'unique_pairs': 5698, 'min_sentence1_length': 10, 'average_sentence1_length': 45.67, 'max_sentence1_length': 280, 'unique_sentence1': 5698, 'min_sentence2_length': 11, 'average_sentence2_length': 49.82, 'max_sentence2_length': 287, 'unique_sentence2': 5698}, 'aze-eng': {'num_samples': 603, 'number_of_characters': 58907, 'unique_pairs': 603, 'min_sentence1_length': 12, 'average_sentence1_length': 49.94, 'max_sentence1_length': 121, 'unique_sentence1': 603, 'min_sentence2_length': 12, 'average_sentence2_length': 47.75, 'max_sentence2_length': 129, 'unique_sentence2': 603}, 'ben-eng': {'num_samples': 1367, 'number_of_characters': 126399, 'unique_pairs': 1367, 'min_sentence1_length': 10, 'average_sentence1_length': 46.61, 'max_sentence1_length': 147, 'unique_sentence1': 1367, 'min_sentence2_length': 14, 'average_sentence2_length': 45.85, 'max_sentence2_length': 148, 'unique_sentence2': 1367}, 'bul-eng': {'num_samples': 2133, 'number_of_characters': 219893, 'unique_pairs': 2133, 'min_sentence1_length': 12, 'average_sentence1_length': 53.71, 'max_sentence1_length': 273, 'unique_sentence1': 2133, 'min_sentence2_length': 11, 'average_sentence2_length': 49.38, 'max_sentence2_length': 287, 'unique_sentence2': 2133}, 'cat-eng': {'num_samples': 1152, 
'number_of_characters': 118852, 'unique_pairs': 1152, 'min_sentence1_length': 10, 'average_sentence1_length': 52.34, 'max_sentence1_length': 202, 'unique_sentence1': 1152, 'min_sentence2_length': 11, 'average_sentence2_length': 50.83, 'max_sentence2_length': 195, 'unique_sentence2': 1152}, 'ces-eng': {'num_samples': 3775, 'number_of_characters': 364386, 'unique_pairs': 3775, 'min_sentence1_length': 10, 'average_sentence1_length': 47.79, 'max_sentence1_length': 265, 'unique_sentence1': 3775, 'min_sentence2_length': 10, 'average_sentence2_length': 48.74, 'max_sentence2_length': 287, 'unique_sentence2': 3775}, 'dan-eng': {'num_samples': 4512, 'number_of_characters': 451232, 'unique_pairs': 4512, 'min_sentence1_length': 10, 'average_sentence1_length': 51.15, 'max_sentence1_length': 296, 'unique_sentence1': 4512, 'min_sentence2_length': 10, 'average_sentence2_length': 48.86, 'max_sentence2_length': 287, 'unique_sentence2': 4512}, 'deu-eng': {'num_samples': 37348, 'number_of_characters': 3890899, 'unique_pairs': 37348, 'min_sentence1_length': 8, 'average_sentence1_length': 55.15, 'max_sentence1_length': 716, 'unique_sentence1': 37348, 'min_sentence2_length': 9, 'average_sentence2_length': 49.03, 'max_sentence2_length': 586, 'unique_sentence2': 37348}, 'ell-eng': {'num_samples': 2790, 'number_of_characters': 302459, 'unique_pairs': 2790, 'min_sentence1_length': 14, 'average_sentence1_length': 58.78, 'max_sentence1_length': 286, 'unique_sentence1': 2790, 'min_sentence2_length': 11, 'average_sentence2_length': 49.63, 'max_sentence2_length': 243, 'unique_sentence2': 2790}, 'eng-est': {'num_samples': 755, 'number_of_characters': 75289, 'unique_pairs': 755, 'min_sentence1_length': 12, 'average_sentence1_length': 49.1, 'max_sentence1_length': 152, 'unique_sentence1': 755, 'min_sentence2_length': 15, 'average_sentence2_length': 50.62, 'max_sentence2_length': 160, 'unique_sentence2': 755}, 'eng-fas': {'num_samples': 556, 'number_of_characters': 52628, 'unique_pairs': 556, 
'min_sentence1_length': 12, 'average_sentence1_length': 48.16, 'max_sentence1_length': 184, 'unique_sentence1': 556, 'min_sentence2_length': 10, 'average_sentence2_length': 46.49, 'max_sentence2_length': 161, 'unique_sentence2': 556}, 'eng-fin': {'num_samples': 3443, 'number_of_characters': 348436, 'unique_pairs': 3443, 'min_sentence1_length': 11, 'average_sentence1_length': 49.68, 'max_sentence1_length': 410, 'unique_sentence1': 3443, 'min_sentence2_length': 11, 'average_sentence2_length': 51.52, 'max_sentence2_length': 387, 'unique_sentence2': 3443}, 'eng-fra': {'num_samples': 37208, 'number_of_characters': 4091513, 'unique_pairs': 37208, 'min_sentence1_length': 8, 'average_sentence1_length': 49.11, 'max_sentence1_length': 414, 'unique_sentence1': 37208, 'min_sentence2_length': 10, 'average_sentence2_length': 60.86, 'max_sentence2_length': 428, 'unique_sentence2': 37208}, 'eng-heb': {'num_samples': 882, 'number_of_characters': 88397, 'unique_pairs': 882, 'min_sentence1_length': 13, 'average_sentence1_length': 54.84, 'max_sentence1_length': 193, 'unique_sentence1': 882, 'min_sentence2_length': 10, 'average_sentence2_length': 45.38, 'max_sentence2_length': 162, 'unique_sentence2': 882}, 'eng-hin': {'num_samples': 2219, 'number_of_characters': 227451, 'unique_pairs': 2219, 'min_sentence1_length': 12, 'average_sentence1_length': 49.35, 'max_sentence1_length': 222, 'unique_sentence1': 2219, 'min_sentence2_length': 12, 'average_sentence2_length': 53.15, 'max_sentence2_length': 281, 'unique_sentence2': 2219}, 'eng-hrv': {'num_samples': 336, 'number_of_characters': 35675, 'unique_pairs': 336, 'min_sentence1_length': 11, 'average_sentence1_length': 52.62, 'max_sentence1_length': 264, 'unique_sentence1': 336, 'min_sentence2_length': 16, 'average_sentence2_length': 53.56, 'max_sentence2_length': 258, 'unique_sentence2': 336}, 'eng-hun': {'num_samples': 2185, 'number_of_characters': 225323, 'unique_pairs': 2185, 'min_sentence1_length': 8, 'average_sentence1_length': 50.01, 
'max_sentence1_length': 238, 'unique_sentence1': 2185, 'min_sentence2_length': 9, 'average_sentence2_length': 53.11, 'max_sentence2_length': 233, 'unique_sentence2': 2185}, 'eng-ind': {'num_samples': 3454, 'number_of_characters': 364799, 'unique_pairs': 3454, 'min_sentence1_length': 11, 'average_sentence1_length': 49.18, 'max_sentence1_length': 222, 'unique_sentence1': 3454, 'min_sentence2_length': 11, 'average_sentence2_length': 56.43, 'max_sentence2_length': 245, 'unique_sentence2': 3454}, 'eng-isl': {'num_samples': 358, 'number_of_characters': 33431, 'unique_pairs': 358, 'min_sentence1_length': 13, 'average_sentence1_length': 46.72, 'max_sentence1_length': 136, 'unique_sentence1': 358, 'min_sentence2_length': 13, 'average_sentence2_length': 46.66, 'max_sentence2_length': 122, 'unique_sentence2': 358}, 'eng-ita': {'num_samples': 19661, 'number_of_characters': 2063797, 'unique_pairs': 19661, 'min_sentence1_length': 10, 'average_sentence1_length': 49.52, 'max_sentence1_length': 365, 'unique_sentence1': 19661, 'min_sentence2_length': 10, 'average_sentence2_length': 55.45, 'max_sentence2_length': 362, 'unique_sentence2': 19661}, 'eng-jpn': {'num_samples': 3807, 'number_of_characters': 318641, 'unique_pairs': 3807, 'min_sentence1_length': 12, 'average_sentence1_length': 51.45, 'max_sentence1_length': 743, 'unique_sentence1': 3807, 'min_sentence2_length': 10, 'average_sentence2_length': 32.25, 'max_sentence2_length': 414, 'unique_sentence2': 3807}, 'eng-kaz': {'num_samples': 346, 'number_of_characters': 32798, 'unique_pairs': 346, 'min_sentence1_length': 15, 'average_sentence1_length': 45.54, 'max_sentence1_length': 110, 'unique_sentence1': 346, 'min_sentence2_length': 18, 'average_sentence2_length': 49.25, 'max_sentence2_length': 122, 'unique_sentence2': 346}, 'eng-kor': {'num_samples': 2558, 'number_of_characters': 217654, 'unique_pairs': 2558, 'min_sentence1_length': 12, 'average_sentence1_length': 51.78, 'max_sentence1_length': 252, 'unique_sentence1': 2558, 
'min_sentence2_length': 10, 'average_sentence2_length': 33.31, 'max_sentence2_length': 225, 'unique_sentence2': 2558}, 'eng-lav': {'num_samples': 1079, 'number_of_characters': 103672, 'unique_pairs': 1079, 'min_sentence1_length': 12, 'average_sentence1_length': 47.47, 'max_sentence1_length': 165, 'unique_sentence1': 1079, 'min_sentence2_length': 11, 'average_sentence2_length': 48.61, 'max_sentence2_length': 151, 'unique_sentence2': 1079}, 'eng-lit': {'num_samples': 1185, 'number_of_characters': 113428, 'unique_pairs': 1185, 'min_sentence1_length': 12, 'average_sentence1_length': 47.12, 'max_sentence1_length': 175, 'unique_sentence1': 1185, 'min_sentence2_length': 12, 'average_sentence2_length': 48.6, 'max_sentence2_length': 167, 'unique_sentence2': 1185}, 'eng-mar': {'num_samples': 280, 'number_of_characters': 26641, 'unique_pairs': 280, 'min_sentence1_length': 14, 'average_sentence1_length': 47.66, 'max_sentence1_length': 148, 'unique_sentence1': 280, 'min_sentence2_length': 14, 'average_sentence2_length': 47.49, 'max_sentence2_length': 143, 'unique_sentence2': 280}, 'eng-msa': {'num_samples': 469, 'number_of_characters': 59862, 'unique_pairs': 469, 'min_sentence1_length': 15, 'average_sentence1_length': 61.27, 'max_sentence1_length': 163, 'unique_sentence1': 469, 'min_sentence2_length': 14, 'average_sentence2_length': 66.37, 'max_sentence2_length': 170, 'unique_sentence2': 469}, 'eng-nld': {'num_samples': 15613, 'number_of_characters': 1555910, 'unique_pairs': 15613, 'min_sentence1_length': 10, 'average_sentence1_length': 47.86, 'max_sentence1_length': 312, 'unique_sentence1': 15613, 'min_sentence2_length': 10, 'average_sentence2_length': 51.79, 'max_sentence2_length': 337, 'unique_sentence2': 15613}, 'eng-nor': {'num_samples': 2666, 'number_of_characters': 263032, 'unique_pairs': 2666, 'min_sentence1_length': 11, 'average_sentence1_length': 49.04, 'max_sentence1_length': 287, 'unique_sentence1': 2666, 'min_sentence2_length': 11, 'average_sentence2_length': 
49.63, 'max_sentence2_length': 291, 'unique_sentence2': 2666}, 'eng-pol': {'num_samples': 6868, 'number_of_characters': 682796, 'unique_pairs': 6868, 'min_sentence1_length': 8, 'average_sentence1_length': 47.91, 'max_sentence1_length': 313, 'unique_sentence1': 6868, 'min_sentence2_length': 9, 'average_sentence2_length': 51.51, 'max_sentence2_length': 325, 'unique_sentence2': 6868}, 'eng-por': {'num_samples': 12406, 'number_of_characters': 1237980, 'unique_pairs': 12406, 'min_sentence1_length': 10, 'average_sentence1_length': 47.42, 'max_sentence1_length': 743, 'unique_sentence1': 12406, 'min_sentence2_length': 10, 'average_sentence2_length': 52.37, 'max_sentence2_length': 761, 'unique_sentence2': 12406}, 'eng-ron': {'num_samples': 3039, 'number_of_characters': 322632, 'unique_pairs': 3039, 'min_sentence1_length': 8, 'average_sentence1_length': 50.61, 'max_sentence1_length': 586, 'unique_sentence1': 3039, 'min_sentence2_length': 9, 'average_sentence2_length': 55.55, 'max_sentence2_length': 699, 'unique_sentence2': 3039}, 'eng-rus': {'num_samples': 9360, 'number_of_characters': 983153, 'unique_pairs': 9360, 'min_sentence1_length': 10, 'average_sentence1_length': 51.39, 'max_sentence1_length': 607, 'unique_sentence1': 9360, 'min_sentence2_length': 10, 'average_sentence2_length': 53.65, 'max_sentence2_length': 773, 'unique_sentence2': 9360}, 'eng-slk': {'num_samples': 1823, 'number_of_characters': 178649, 'unique_pairs': 1823, 'min_sentence1_length': 12, 'average_sentence1_length': 49.06, 'max_sentence1_length': 637, 'unique_sentence1': 1823, 'min_sentence2_length': 10, 'average_sentence2_length': 48.94, 'max_sentence2_length': 597, 'unique_sentence2': 1823}, 'eng-slv': {'num_samples': 1450, 'number_of_characters': 151964, 'unique_pairs': 1450, 'min_sentence1_length': 12, 'average_sentence1_length': 51.28, 'max_sentence1_length': 287, 'unique_sentence1': 1450, 'min_sentence2_length': 11, 'average_sentence2_length': 53.52, 'max_sentence2_length': 252, 
'unique_sentence2': 1450}, 'eng-spa': {'num_samples': 35446, 'number_of_characters': 3679410, 'unique_pairs': 35446, 'min_sentence1_length': 10, 'average_sentence1_length': 48.67, 'max_sentence1_length': 525, 'unique_sentence1': 35446, 'min_sentence2_length': 10, 'average_sentence2_length': 55.13, 'max_sentence2_length': 607, 'unique_sentence2': 35446}, 'eng-srp': {'num_samples': 303, 'number_of_characters': 29733, 'unique_pairs': 303, 'min_sentence1_length': 15, 'average_sentence1_length': 48.15, 'max_sentence1_length': 123, 'unique_sentence1': 303, 'min_sentence2_length': 10, 'average_sentence2_length': 49.98, 'max_sentence2_length': 128, 'unique_sentence2': 303}, 'eng-swe': {'num_samples': 6005, 'number_of_characters': 597122, 'unique_pairs': 6005, 'min_sentence1_length': 10, 'average_sentence1_length': 49.21, 'max_sentence1_length': 287, 'unique_sentence1': 6005, 'min_sentence2_length': 10, 'average_sentence2_length': 50.22, 'max_sentence2_length': 289, 'unique_sentence2': 6005}, 'eng-tgl': {'num_samples': 551, 'number_of_characters': 56336, 'unique_pairs': 551, 'min_sentence1_length': 14, 'average_sentence1_length': 45.61, 'max_sentence1_length': 165, 'unique_sentence1': 551, 'min_sentence2_length': 13, 'average_sentence2_length': 56.64, 'max_sentence2_length': 198, 'unique_sentence2': 551}, 'eng-tha': {'num_samples': 814, 'number_of_characters': 79610, 'unique_pairs': 814, 'min_sentence1_length': 11, 'average_sentence1_length': 50.45, 'max_sentence1_length': 544, 'unique_sentence1': 814, 'min_sentence2_length': 11, 'average_sentence2_length': 47.35, 'max_sentence2_length': 511, 'unique_sentence2': 814}, 'eng-tur': {'num_samples': 4606, 'number_of_characters': 446492, 'unique_pairs': 4606, 'min_sentence1_length': 10, 'average_sentence1_length': 47.39, 'max_sentence1_length': 287, 'unique_sentence1': 4606, 'min_sentence2_length': 10, 'average_sentence2_length': 49.54, 'max_sentence2_length': 314, 'unique_sentence2': 4606}, 'eng-ukr': {'num_samples': 3778, 
'number_of_characters': 388398, 'unique_pairs': 3778, 'min_sentence1_length': 11, 'average_sentence1_length': 51.14, 'max_sentence1_length': 284, 'unique_sentence1': 3778, 'min_sentence2_length': 10, 'average_sentence2_length': 51.66, 'max_sentence2_length': 255, 'unique_sentence2': 3778}, 'eng-urd': {'num_samples': 268, 'number_of_characters': 25182, 'unique_pairs': 268, 'min_sentence1_length': 14, 'average_sentence1_length': 46.1, 'max_sentence1_length': 99, 'unique_sentence1': 268, 'min_sentence2_length': 15, 'average_sentence2_length': 47.86, 'max_sentence2_length': 107, 'unique_sentence2': 268}, 'eng-vie': {'num_samples': 1264, 'number_of_characters': 123022, 'unique_pairs': 1264, 'min_sentence1_length': 12, 'average_sentence1_length': 45.65, 'max_sentence1_length': 241, 'unique_sentence1': 1264, 'min_sentence2_length': 10, 'average_sentence2_length': 51.68, 'max_sentence2_length': 262, 'unique_sentence2': 1264}, 'eng-zho': {'num_samples': 4959, 'number_of_characters': 347349, 'unique_pairs': 4959, 'min_sentence1_length': 11, 'average_sentence1_length': 50.01, 'max_sentence1_length': 847, 'unique_sentence1': 4959, 'min_sentence2_length': 8, 'average_sentence2_length': 20.04, 'max_sentence2_length': 620, 'unique_sentence2': 4959}}}} | @@ -876,9 +909,9 @@ The following tables give you an overview of the tasks in MTEB. 
| [WikipediaSolidStateColloidalClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | | [WikipediaSpecialtiesInChemistryClustering](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Clustering | s2p | [Chemistry] | None | None | | [WikipediaTheoreticalAppliedClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | -| [WinoGrande](https://winogrande.allenai.org/) (Xiao et al., 2024) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | +| [WinoGrande](https://winogrande.allenai.org/) (Sakaguchi et al., 2021) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | | [Winoground](https://openaccess.thecvf.com/content/CVPR2022/html/Thrush_Winoground_Probing_Vision_and_Language_Models_for_Visio-Linguistic_Compositionality_CVPR_2022_paper) (Tristan Thrush, 2022) | ['eng'] | Compositionality | i2t | [Social] | {'test': 400} | {'test': {'num_samples': 400, 'num_images': 800, 'num_texts': 800, 'num_unique_texts': 800, 'min_text_length': 8, 'average_text_length': 45.47, 'max_text_length': 151}} | -| [WisesightSentimentClassification](https://github.com/PyThaiNLP/wisesight-sentiment) | ['tha'] | Classification | s2s | [News, Social, Written] | None | None | +| [WisesightSentimentClassification](https://github.com/PyThaiNLP/wisesight-sentiment) (Suriyawongkul et al., 2019) | ['tha'] | Classification | s2s | [News, Social, Written] | None | None | | [XFlickr30kCoT2IRetrieval](https://proceedings.mlr.press/v162/bugliarello22a/bugliarello22a.pdf) (Bugliarello et al., 2022) | ['deu', 'eng', 'ind', 'jpn', 'rus', 'spa', 'tur', 'zho'] | Any2AnyMultilingualRetrieval | t2i | [Encyclopaedic, Written] | {'test': 32000} | {'test': {'number_of_characters': 1149877, 'num_samples': 32000, 'num_queries': 16000, 'num_documents': 16000, 'min_document_length': 0, 
'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 16000, 'min_query_length': 12, 'average_query_length': 71.87, 'max_query_length': 385, 'unique_queries': 15987, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 16000, 'hf_subset_descriptive_stats': {'de': {'number_of_characters': 132154, 'num_samples': 4000, 'num_queries': 2000, 'num_documents': 2000, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 2000, 'min_query_length': 4, 'average_query_length': 66.08, 'max_query_length': 220, 'unique_queries': 1994, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 2000}, 'en': {'number_of_characters': 153801, 'num_samples': 4000, 'num_queries': 2000, 'num_documents': 2000, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 2000, 'min_query_length': 34, 'average_query_length': 76.9, 'max_query_length': 377, 'unique_queries': 2000, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 2000}, 'es': {'number_of_characters': 160049, 'num_samples': 4000, 'num_queries': 2000, 'num_documents': 2000, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 2000, 'min_query_length': 23, 'average_query_length': 80.02, 'max_query_length': 342, 'unique_queries': 2000, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 2000}, 'id': {'number_of_characters': 167858, 'num_samples': 4000, 'num_queries': 2000, 
'num_documents': 2000, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 2000, 'min_query_length': 4, 'average_query_length': 83.93, 'max_query_length': 211, 'unique_queries': 2000, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 2000}, 'ja': {'number_of_characters': 75480, 'num_samples': 4000, 'num_queries': 2000, 'num_documents': 2000, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 2000, 'min_query_length': 9, 'average_query_length': 37.74, 'max_query_length': 179, 'unique_queries': 2000, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 2000}, 'ru': {'number_of_characters': 149947, 'num_samples': 4000, 'num_queries': 2000, 'num_documents': 2000, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 2000, 'min_query_length': 10, 'average_query_length': 74.97, 'max_query_length': 294, 'unique_queries': 1997, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 2000}, 'tr': {'number_of_characters': 136134, 'num_samples': 4000, 'num_queries': 2000, 'num_documents': 2000, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 2000, 'min_query_length': 19, 'average_query_length': 68.07, 'max_query_length': 199, 'unique_queries': 1997, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 2000}, 'zh': {'number_of_characters': 46454, 'num_samples': 4000, 
'num_queries': 2000, 'num_documents': 2000, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 2000, 'min_query_length': 10, 'average_query_length': 23.23, 'max_query_length': 66, 'unique_queries': 1999, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 2000}}}} | | [XM3600T2IRetrieval](https://aclanthology.org/2022.emnlp-main.45/) (Thapliyal et al., 2022) | ['ara', 'ben', 'ces', 'dan', 'deu', 'ell', 'eng', 'fas', 'fil', 'fin', 'fra', 'heb', 'hin', 'hrv', 'hun', 'ind', 'ita', 'jpn', 'kor', 'mri', 'nld', 'nor', 'pol', 'por', 'quz', 'ron', 'rus', 'spa', 'swa', 'swe', 'tel', 'tha', 'tur', 'ukr', 'vie', 'zho'] | Any2AnyMultilingualRetrieval | t2i | [Encyclopaedic, Written] | {'test': 390975} | {'test': {'number_of_characters': 17009034, 'num_samples': 390975, 'num_queries': 261375, 'num_documents': 129600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 129600, 'min_query_length': 9, 'average_query_length': 65.08, 'max_query_length': 532, 'unique_queries': 259932, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 129600, 'hf_subset_descriptive_stats': {'ar': {'number_of_characters': 310802, 'num_samples': 10967, 'num_queries': 7367, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 8, 'average_query_length': 42.19, 'max_query_length': 208, 'unique_queries': 7339, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'bn': {'number_of_characters': 223622, 'num_samples': 7200, 
'num_queries': 3600, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 28, 'average_query_length': 62.12, 'max_query_length': 139, 'unique_queries': 3594, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'cs': {'number_of_characters': 282069, 'num_samples': 10807, 'num_queries': 7207, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 3, 'average_query_length': 39.14, 'max_query_length': 266, 'unique_queries': 6814, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'da': {'number_of_characters': 351028, 'num_samples': 10864, 'num_queries': 7264, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 7, 'average_query_length': 48.32, 'max_query_length': 158, 'unique_queries': 7246, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'de': {'number_of_characters': 660790, 'num_samples': 12243, 'num_queries': 8643, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 11, 'average_query_length': 76.45, 'max_query_length': 334, 'unique_queries': 8643, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'el': {'number_of_characters': 370363, 
'num_samples': 10804, 'num_queries': 7204, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 4, 'average_query_length': 51.41, 'max_query_length': 262, 'unique_queries': 7100, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'en': {'number_of_characters': 356488, 'num_samples': 10800, 'num_queries': 7200, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 6, 'average_query_length': 49.51, 'max_query_length': 148, 'unique_queries': 7129, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'es': {'number_of_characters': 485004, 'num_samples': 12214, 'num_queries': 8614, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 12, 'average_query_length': 56.3, 'max_query_length': 179, 'unique_queries': 8605, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'fa': {'number_of_characters': 430055, 'num_samples': 10845, 'num_queries': 7245, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 4, 'average_query_length': 59.36, 'max_query_length': 289, 'unique_queries': 7242, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'fi': 
{'number_of_characters': 464334, 'num_samples': 10727, 'num_queries': 7127, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 1, 'average_query_length': 65.15, 'max_query_length': 336, 'unique_queries': 7110, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'fil': {'number_of_characters': 480287, 'num_samples': 10709, 'num_queries': 7109, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 4, 'average_query_length': 67.56, 'max_query_length': 332, 'unique_queries': 7016, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'fr': {'number_of_characters': 595836, 'num_samples': 12162, 'num_queries': 8562, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 15, 'average_query_length': 69.59, 'max_query_length': 173, 'unique_queries': 8560, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'he': {'number_of_characters': 457775, 'num_samples': 10800, 'num_queries': 7200, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 3, 'average_query_length': 63.58, 'max_query_length': 453, 'unique_queries': 7190, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 3600}, 'hi': {'number_of_characters': 509092, 'num_samples': 12103, 'num_queries': 8503, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 3, 'average_query_length': 59.87, 'max_query_length': 188, 'unique_queries': 8422, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'hr': {'number_of_characters': 420595, 'num_samples': 10880, 'num_queries': 7280, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 3, 'average_query_length': 57.77, 'max_query_length': 271, 'unique_queries': 7224, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'hu': {'number_of_characters': 436677, 'num_samples': 10816, 'num_queries': 7216, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 5, 'average_query_length': 60.52, 'max_query_length': 393, 'unique_queries': 7209, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'id': {'number_of_characters': 666387, 'num_samples': 10726, 'num_queries': 7126, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 16, 'average_query_length': 93.51, 'max_query_length': 286, 'unique_queries': 7125, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 
'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'it': {'number_of_characters': 608604, 'num_samples': 12071, 'num_queries': 8471, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 15, 'average_query_length': 71.85, 'max_query_length': 201, 'unique_queries': 8470, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'ja': {'number_of_characters': 186672, 'num_samples': 10785, 'num_queries': 7185, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 1, 'average_query_length': 25.98, 'max_query_length': 97, 'unique_queries': 7175, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'ko': {'number_of_characters': 188812, 'num_samples': 11250, 'num_queries': 7650, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 1, 'average_query_length': 24.68, 'max_query_length': 113, 'unique_queries': 7644, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'mi': {'number_of_characters': 262800, 'num_samples': 8332, 'num_queries': 4732, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 7, 'average_query_length': 55.54, 'max_query_length': 304, 'unique_queries': 4707, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 
'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'nl': {'number_of_characters': 370231, 'num_samples': 11659, 'num_queries': 8059, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 4, 'average_query_length': 45.94, 'max_query_length': 173, 'unique_queries': 8004, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'no': {'number_of_characters': 391381, 'num_samples': 10813, 'num_queries': 7213, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 5, 'average_query_length': 54.26, 'max_query_length': 162, 'unique_queries': 7191, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'pl': {'number_of_characters': 411189, 'num_samples': 10741, 'num_queries': 7141, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 4, 'average_query_length': 57.58, 'max_query_length': 226, 'unique_queries': 7117, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'pt': {'number_of_characters': 446873, 'num_samples': 10843, 'num_queries': 7243, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 10, 'average_query_length': 61.7, 'max_query_length': 324, 'unique_queries': 7220, 'num_query_images': 0, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'quz': {'number_of_characters': 278263, 'num_samples': 10800, 'num_queries': 7200, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 2, 'average_query_length': 38.65, 'max_query_length': 234, 'unique_queries': 7130, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'ro': {'number_of_characters': 629977, 'num_samples': 10723, 'num_queries': 7123, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 5, 'average_query_length': 88.44, 'max_query_length': 524, 'unique_queries': 7122, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'ru': {'number_of_characters': 477558, 'num_samples': 10800, 'num_queries': 7200, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 11, 'average_query_length': 66.33, 'max_query_length': 232, 'unique_queries': 7194, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'sv': {'number_of_characters': 339400, 'num_samples': 10873, 'num_queries': 7273, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 6, 'average_query_length': 46.67, 'max_query_length': 174, 'unique_queries': 7199, 
'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'sw': {'number_of_characters': 444085, 'num_samples': 10646, 'num_queries': 7046, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 2, 'average_query_length': 63.03, 'max_query_length': 299, 'unique_queries': 7014, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'te': {'number_of_characters': 341340, 'num_samples': 10800, 'num_queries': 7200, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 16, 'average_query_length': 47.41, 'max_query_length': 132, 'unique_queries': 7062, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'th': {'number_of_characters': 344730, 'num_samples': 10800, 'num_queries': 7200, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 8, 'average_query_length': 47.88, 'max_query_length': 147, 'unique_queries': 7170, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'tr': {'number_of_characters': 458639, 'num_samples': 10833, 'num_queries': 7233, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 7, 'average_query_length': 63.41, 'max_query_length': 453, 
'unique_queries': 7224, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'uk': {'number_of_characters': 474311, 'num_samples': 10815, 'num_queries': 7215, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 10, 'average_query_length': 65.74, 'max_query_length': 372, 'unique_queries': 7206, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'vi': {'number_of_characters': 582546, 'num_samples': 10950, 'num_queries': 7350, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 6, 'average_query_length': 79.26, 'max_query_length': 287, 'unique_queries': 7350, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}, 'zh': {'number_of_characters': 165110, 'num_samples': 10774, 'num_queries': 7174, 'num_documents': 3600, 'min_document_length': 0, 'average_document_length': 0, 'max_document_length': 0, 'unique_documents': 0, 'num_document_images': 3600, 'min_query_length': 1, 'average_query_length': 23.02, 'max_query_length': 96, 'unique_queries': 7165, 'num_query_images': 0, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3600}}}} | | XMarket (Bonab et al., 2021) | ['deu', 'eng', 'spa'] | Retrieval | s2p | | None | None | @@ -886,14 +919,15 @@ The following tables give you an overview of the tasks in MTEB. 
| [XNLIV2](https://arxiv.org/pdf/2301.06527) (Upadhyay et al., 2023) | ['asm', 'ben', 'bho', 'ell', 'guj', 'kan', 'mar', 'ory', 'pan', 'rus', 'san', 'tam', 'tur'] | PairClassification | s2s | [Fiction, Government, Non-fiction, Written] | None | None | | [XPQARetrieval](https://arxiv.org/abs/2305.09249) (Shen et al., 2023) | ['ara', 'cmn', 'deu', 'eng', 'fra', 'hin', 'ita', 'jpn', 'kor', 'pol', 'por', 'spa', 'tam'] | Retrieval | s2p | [Reviews, Written] | None | None | | [XQuADRetrieval](https://huggingface.co/datasets/xquad) (Mikel Artetxe, 2019) | ['arb', 'deu', 'ell', 'eng', 'hin', 'ron', 'rus', 'spa', 'tha', 'tur', 'vie', 'zho'] | Retrieval | s2p | [Web, Written] | None | None | -| [XStance](https://github.com/ZurichNLP/xstance) | ['deu', 'fra', 'ita'] | PairClassification | s2s | [Social, Written] | None | None | +| [XStance](https://github.com/ZurichNLP/xstance) (Vamvas et al., 2020) | ['deu', 'fra', 'ita'] | PairClassification | s2s | [Social, Written] | None | None | | [YahooAnswersTopicsClassification](https://huggingface.co/datasets/yahoo_answers_topics) (Zhang et al., 2015) | ['eng'] | Classification | s2s | [Web, Written] | None | None | | [YelpReviewFullClassification](https://arxiv.org/abs/1509.01626) (Zhang et al., 2015) | ['eng'] | Classification | s2s | [Reviews, Written] | None | None | | [YueOpenriceReviewClassification](https://github.com/Christainx/Dataset_Cantonese_Openrice) (Xiang et al., 2019) | ['yue'] | Classification | s2s | [Reviews, Spoken] | None | None | -| [indonli](https://link.springer.com/chapter/10.1007/978-3-030-41505-1_39) | ['ind'] | PairClassification | s2s | [Encyclopaedic, News, Web, Written] | None | None | +| [ZacLegalTextRetrieval](https://challenge.zalo.ai/) | ['vie'] | Retrieval | s2p | [Legal] | None | None | +| [indonli](https://link.springer.com/chapter/10.1007/978-3-030-41505-1_39) (Mahendra et al., 2021) | ['ind'] | PairClassification | s2s | [Encyclopaedic, News, Web, Written] | None | None | | 
[mFollowIRCrossLingualInstructionRetrieval](https://neuclir.github.io/) (Weller et al., 2024) | ['eng', 'fas', 'rus', 'zho'] | Retrieval | s2p | [News, Written] | {'test': 121758} | {'test': {'num_samples': 121758, 'num_docs': 121635, 'num_queries': 123, 'number_of_characters': 283654099, 'min_document_length': 74, 'average_document_length': 2331.08, 'max_document_length': 24179, 'unique_docs': 121635, 'min_query_length': 32, 'average_query_length': 81.88, 'max_query_length': 173, 'unique_queries': 75, 'min_instruction_length': 93, 'average_instruction_length': 389.95, 'max_instruction_length': 887, 'unique_instructions': 75, 'min_changed_instruction_length': 180, 'average_changed_instruction_length': 450.55, 'max_changed_instruction_length': 974, 'unique_changed_instructions': 123, 'min_average_relevant_docs_per_query': 0, 'average_relevant_docs_per_query': 10.43, 'max_average_relevant_docs_per_query': 24, 'min_average_top_ranked_per_query': 1000, 'average_top_ranked_per_query': 1000.0, 'max_average_top_ranked_per_query': 1000, 'hf_subset_descriptive_stats': {'eng-fas': {'num_samples': 41229, 'num_docs': 41189, 'num_queries': 40, 'number_of_characters': 129597567, 'min_document_length': 99, 'average_document_length': 3145.5, 'max_document_length': 24179, 'unique_docs': 41189, 'min_query_length': 34, 'average_query_length': 80.08, 'max_query_length': 124, 'unique_queries': 40, 'min_instruction_length': 150, 'average_instruction_length': 396.88, 'max_instruction_length': 887, 'unique_instructions': 40, 'min_changed_instruction_length': 205, 'average_changed_instruction_length': 463.18, 'max_changed_instruction_length': 974, 'unique_changed_instructions': 40, 'min_average_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 10.85, 'max_average_relevant_docs_per_query': 22, 'min_average_top_ranked_per_query': 1000, 'average_top_ranked_per_query': 1000.0, 'max_average_top_ranked_per_query': 1000}, 'eng-rus': {'num_samples': 39366, 'num_docs': 39326, 
'num_queries': 40, 'number_of_characters': 109522175, 'min_document_length': 75, 'average_document_length': 2784.08, 'max_document_length': 24061, 'unique_docs': 39326, 'min_query_length': 32, 'average_query_length': 81.88, 'max_query_length': 173, 'unique_queries': 40, 'min_instruction_length': 93, 'average_instruction_length': 371.12, 'max_instruction_length': 887, 'unique_instructions': 40, 'min_changed_instruction_length': 180, 'average_changed_instruction_length': 431.8, 'max_changed_instruction_length': 957, 'unique_changed_instructions': 40, 'min_average_relevant_docs_per_query': 0, 'average_relevant_docs_per_query': 9.78, 'max_average_relevant_docs_per_query': 24, 'min_average_top_ranked_per_query': 1000, 'average_top_ranked_per_query': 1000.0, 'max_average_top_ranked_per_query': 1000}, 'eng-zho': {'num_samples': 41163, 'num_docs': 41120, 'num_queries': 43, 'number_of_characters': 44534357, 'min_document_length': 74, 'average_document_length': 1082.05, 'max_document_length': 23840, 'unique_docs': 41120, 'min_query_length': 32, 'average_query_length': 83.56, 'max_query_length': 159, 'unique_queries': 43, 'min_instruction_length': 157, 'average_instruction_length': 401.02, 'max_instruction_length': 731, 'unique_instructions': 43, 'min_changed_instruction_length': 209, 'average_changed_instruction_length': 456.26, 'max_changed_instruction_length': 822, 'unique_changed_instructions': 43, 'min_average_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 10.65, 'max_average_relevant_docs_per_query': 24, 'min_average_top_ranked_per_query': 1000, 'average_top_ranked_per_query': 1000.0, 'max_average_top_ranked_per_query': 1000}}}} | | [mFollowIRInstructionRetrieval](https://neuclir.github.io/) (Weller et al., 2024) | ['fas', 'rus', 'zho'] | Retrieval | s2p | [News, Written] | {'test': 121758} | {'test': {'num_samples': 121758, 'num_docs': 121635, 'num_queries': 123, 'number_of_characters': 283622456, 'min_document_length': 74, 'average_document_length': 
2331.08, 'max_document_length': 24179, 'unique_docs': 121635, 'min_query_length': 10, 'average_query_length': 57.11, 'max_query_length': 136, 'unique_queries': 123, 'min_instruction_length': 37, 'average_instruction_length': 281.07, 'max_instruction_length': 1009, 'unique_instructions': 123, 'min_changed_instruction_length': 44, 'average_changed_instruction_length': 326.94, 'max_changed_instruction_length': 1083, 'unique_changed_instructions': 123, 'min_average_relevant_docs_per_query': 0, 'average_relevant_docs_per_query': 10.43, 'max_average_relevant_docs_per_query': 24, 'min_average_top_ranked_per_query': 1000, 'average_top_ranked_per_query': 1000.0, 'max_average_top_ranked_per_query': 1000, 'hf_subset_descriptive_stats': {'fas': {'num_samples': 41229, 'num_docs': 41189, 'num_queries': 40, 'number_of_characters': 129593838, 'min_document_length': 99, 'average_document_length': 3145.5, 'max_document_length': 24179, 'unique_docs': 41189, 'min_query_length': 34, 'average_query_length': 72.65, 'max_query_length': 124, 'unique_queries': 40, 'min_instruction_length': 121, 'average_instruction_length': 358.93, 'max_instruction_length': 759, 'unique_instructions': 40, 'min_changed_instruction_length': 163, 'average_changed_instruction_length': 415.32, 'max_changed_instruction_length': 842, 'unique_changed_instructions': 40, 'min_average_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 10.85, 'max_average_relevant_docs_per_query': 22, 'min_average_top_ranked_per_query': 1000, 'average_top_ranked_per_query': 1000.0, 'max_average_top_ranked_per_query': 1000}, 'rus': {'num_samples': 39366, 'num_docs': 39326, 'num_queries': 40, 'number_of_characters': 109523683, 'min_document_length': 75, 'average_document_length': 2784.08, 'max_document_length': 24061, 'unique_docs': 39326, 'min_query_length': 26, 'average_query_length': 77.5, 'max_query_length': 136, 'unique_queries': 40, 'min_instruction_length': 78, 'average_instruction_length': 387.0, 
'max_instruction_length': 1009, 'unique_instructions': 40, 'min_changed_instruction_length': 187, 'average_changed_instruction_length': 458.0, 'max_changed_instruction_length': 1083, 'unique_changed_instructions': 40, 'min_average_relevant_docs_per_query': 0, 'average_relevant_docs_per_query': 9.78, 'max_average_relevant_docs_per_query': 24, 'min_average_top_ranked_per_query': 1000, 'average_top_ranked_per_query': 1000.0, 'max_average_top_ranked_per_query': 1000}, 'zho': {'num_samples': 41163, 'num_docs': 41120, 'num_queries': 43, 'number_of_characters': 44504935, 'min_document_length': 74, 'average_document_length': 1082.05, 'max_document_length': 23840, 'unique_docs': 41120, 'min_query_length': 10, 'average_query_length': 23.7, 'max_query_length': 44, 'unique_queries': 43, 'min_instruction_length': 37, 'average_instruction_length': 110.09, 'max_instruction_length': 209, 'unique_instructions': 43, 'min_changed_instruction_length': 44, 'average_changed_instruction_length': 122.81, 'max_changed_instruction_length': 229, 'unique_changed_instructions': 43, 'min_average_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 10.65, 'max_average_relevant_docs_per_query': 24, 'min_average_top_ranked_per_query': 1000, 'average_top_ranked_per_query': 1000.0, 'max_average_top_ranked_per_query': 1000}}}} | -| [mMARCO-NL](https://github.com/unicamp-dl/mMARCO) (Luiz Bonifacio and Israel Campiotti and Roberto de Alencar Lotufo and Rodrigo Frassetto Nogueira, 2021) | ['nld'] | Retrieval | s2p | [Web, Written] | None | None | +| [mMARCO-NL](https://github.com/unicamp-dl/mMARCO) (Luiz Bonifacio and Israel Campiotti and Roberto de Alencar Lotufo and Rodrigo Frassetto Nogueira, 2021) | ['nld'] | Retrieval | s2p | [Web, Written] | None | None | @@ -906,1065 +940,1062 @@ The following tables give you an overview of the tasks in MTEB.
-| ISO Code | Language | Family | Any2AnyMultiChoice | Any2AnyMultilingualRetrieval | Any2AnyRetrieval | BitextMining | Classification | Clustering | Compositionality | DocumentUnderstanding | ImageClassification | ImageClustering | ImageMultilabelClassification | InstructionRetrieval | MultilabelClassification | PairClassification | Reranking | Retrieval | STS | Speed | Summarization | VisionCentricQA | VisualSTS(eng) | VisualSTS(multi) | ZeroShotClassification | Sum | -|----------|----------|--------|--------------------|------------------------------|------------------|--------------|----------------|------------|------------------|-----------------------|---------------------|-----------------|-------------------------------|----------------------|--------------------------|--------------------|-----------|-----------|-----|-------|---------------|-----------------|----------------|------------------|------------------------|-----| -| aai | Arifama-Miniafia | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aak | Ankave | Angan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aau | Abau | Sepik | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aaz | Amarasi | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| abs | Ambonese Malay | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| abt | Ambulas | Ndu | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| abx | Inabaknon | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aby | Aneme Wake | Yareban | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ace | Achinese | 
Austronesian | 0 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| acf | Saint Lucian Creole French | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| acm | Mesopotamian Arabic | Afro-Asiatic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| acq | Ta'izzi-Adeni Arabic | Afro-Asiatic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| acr | Achi | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| acu | Achuar-Shiwiar | Chicham | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| adz | Adzera | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aeb | Tunisian Arabic | Afro-Asiatic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| aer | Eastern Arrernte | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aey | Amele | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| afr | Afrikaans | Indo-European | 0 | 0 | 0 | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| agd | Agarabi | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| agg | Angor | Senagi | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| agm | Angaataha | Angan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| agn | Agutaynen | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 
-| agr | Aguaruna | Chicham | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| agt | Central Cagayan Agta | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| agu | Aguacateco | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aia | Arosi | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aii | Assyrian Neo-Aramaic | Afro-Asiatic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ajp | South Levantine Arabic | Unclassified | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| aka | Akan | Atlantic-Congo | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| ake | Akawaio | Cariban | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| alp | Alune | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| alq | Algonquin | Algic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| als | Tosk Albanian | Indo-European | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| aly | Alyawarr | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ame | Yanesha' | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| amf | Hamer-Banna | South Omotic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| amh | Amharic | Afro-Asiatic | 0 | 0 | 0 | 3 | 6 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 14 | -| amk | Ambai | 
Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| amm | Ama (Papua New Guinea) | Left May | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| amn | Amanab | Border | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| amo | Amo | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| amp | Alamblak | Sepik | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| amr | Amarakaeri | Harakmbut | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| amu | Guerrero Amuzgo | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| amx | Anmatyerre | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ang | Old English (ca. 
450-1100) | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| anh | Nend | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| anp | Angika | Indo-European | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| anv | Denya | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aoi | Anindilyakwa | Gunwinyguan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aoj | Mufian | Nuclear Torricelli | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aom | Ömie | Koiarian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aon | Bumbita Arapesh | Nuclear Torricelli | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| apb | Sa'a | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| apc | Levantine Arabic | Afro-Asiatic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| ape | Bukiyip | Nuclear Torricelli | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| apn | Apinayé | Nuclear-Macro-Je | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| apr | Arop-Lokep | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| apu | Apurinã | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| apw | Western Apache | Athabaskan-Eyak-Tlingit | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 
| -| apz | Safeyoka | Angan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ara | Arabic | Unclassified | 0 | 2 | 0 | 4 | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 2 | 10 | 2 | 0 | 0 | 0 | 0 | 2 | 0 | 36 | -| arb | Standard Arabic | Afro-Asiatic | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | -| are | Western Arrarnta | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| arl | Arabela | Zaparoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| arn | Mapudungun | Araucanian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| arp | Arapaho | Algic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| arq | Algerian Arabic | Afro-Asiatic | 0 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| ars | Najdi Arabic | Afro-Asiatic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| ary | Moroccan Arabic | Afro-Asiatic | 0 | 0 | 0 | 1 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | -| arz | Egyptian Arabic | Afro-Asiatic | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| asm | Assamese | Indo-European | 0 | 0 | 0 | 5 | 3 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 14 | -| aso | Dano | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ast | Asturian | Indo-European | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| ata | Pele-Ata | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 
-| atb | Zaiwa | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| atd | Ata Manobo | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| atg | Ivbie North-Okpela-Arhe | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| att | Pamplona Atta | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| auc | Waorani | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aui | Anuki | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| auy | Awiyaana | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| avt | Au | Nuclear Torricelli | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| awa | Awadhi | Indo-European | 0 | 0 | 0 | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| awb | Awa (Papua New Guinea) | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| awk | Awabakal | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| awx | Awara | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ayr | Central Aymara | Aymaran | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| azb | South Azerbaijani | Turkic | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| aze | Azerbaijani | Unclassified | 0 | 0 | 0 | 4 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | -| azg | San Pedro Amuzgos Amuzgo | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| azj | North Azerbaijani | Turkic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| azz | Highland Puebla Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bak | Bashkir | Turkic | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| bam | Bambara | Mande | 0 | 0 | 0 | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| ban | Balinese | Austronesian | 0 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| bao | Waimaha | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bba | Baatonum | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bbb | Barai | Koiarian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bbc | Batak Toba | Austronesian | 0 | 0 | 0 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| bbr | Girawa | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bch | Bariai | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bco | Kaluli | Bosavi | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bdd | Bunama | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bea | Beaver | Athabaskan-Eyak-Tlingit | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 1 | -| bef | Benabena | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bel | Belarusian | Indo-European | 0 | 0 | 0 | 4 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| bem | Bemba (Zambia) | Atlantic-Congo | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| ben | Bengali | Indo-European | 0 | 1 | 0 | 9 | 9 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 32 | -| beo | Beami | Bosavi | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ber | Berber (Other) | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| beu | Blagar | Timor-Alor-Pantar | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bew | Betawi | Austronesian | 0 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| bgc | Haryanvi | Indo-European | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| bgs | Tagabawa | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bgt | Bughotu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bhb | Bhili | Indo-European | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bhd | Bhadrawahi | Indo-European | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bhg | Binandere | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bhl | Bimin | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bho | Bhojpuri | Indo-European | 0 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| bhp | Bima | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| big | Biangai | Kunimaipan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bjj | Kanauji | Indo-European | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bjk | Barok | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bjn | Banjar | Austronesian | 0 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| bjp | Fanamaket | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bjr | Binumarien | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bjv | Bedjond | Central Sudanic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bjz | Baruga | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bkd | Binukid | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bki | Baki | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bkq | Bakairí | Cariban | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bkx | Baikeno | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| blw | Balangao | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| 
blz | Balantak | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bmh | Kein | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bmk | Ghayavi | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bmr | Muinane | Boran | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bmu | Somba-Siawari | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bnp | Bola | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bns | Bundeli | Indo-European | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| boa | Bora | Boran | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bod | Tibetan | Sino-Tibetan | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| boj | Anjam | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bon | Bine | Eastern Trans-Fly | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bos | Bosnian | Indo-European | 0 | 0 | 0 | 3 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| box | Buamu | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| boy | Bodo (Central African Republic) | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bpr | Koronadal Blaan | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
1 | -| bps | Sarangani Blaan | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bqc | Boko (Benin) | Mande | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bqp | Busa | Mande | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bra | Braj | Indo-European | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bre | Breton | Indo-European | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| brx | Bodo (India) | Sino-Tibetan | 0 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| bsj | Bangwinji | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bsn | Barasana-Eduria | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bsp | Baga Sitemu | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bss | Akoose | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bug | Buginese | Austronesian | 0 | 0 | 0 | 2 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | -| buk | Bugawac | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bul | Bulgarian | Indo-European | 0 | 1 | 0 | 5 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 17 | -| bus | Bokobaru | Mande | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bvd | Baeggu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bvr | Burarra | Maningrida | 0 
| 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bxh | Buhutu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| byr | Baruya | Angan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| byx | Qaqet | Baining | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bzd | Bribri | Chibchan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bzh | Mapos Buang | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bzj | Belize Kriol English | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| caa | Chortí | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cab | Garifuna | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cac | Chuj | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| caf | Southern Carrier | Athabaskan-Eyak-Tlingit | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cak | Kaqchikel | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cao | Chácobo | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cap | Chipaya | Uru-Chipaya | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| car | Galibi Carib | Cariban | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cat | Catalan | Indo-European | 0 | 0 | 0 | 5 | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11 | -| cav | Cavineña | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cax | Chiquitano | Chiquitano | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cbc | Carapana | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cbi | Chachi | Barbacoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cbk | Chavacano | Indo-European | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| cbr | Cashibo-Cacataibo | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cbs | Cashinahua | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cbt | Chayahuita | Cahuapanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cbu | Candoshi-Shapra | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cbv | Cacua | Kakua-Nukak | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cco | Comaltepec Chinantec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ceb | Cebuano | Austronesian | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| cek | Eastern Khumi Chin | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ces | Czech | Indo-European | 0 | 1 | 0 | 6 | 5 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 21 | -| cgc | Kagayanen | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cha | Chamorro | Austronesian | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| chd | Highland Oaxaca Chontal | Tequistlatecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| chf | Tabasco Chontal | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| chk | Chuukese | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| chq | Quiotepec Chinantec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| chv | Chuvash | Turkic | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| chz | Ozumacín Chinantec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cjk | Chokwe | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| cjo | Ashéninka Pajonal | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cjv | Chuave | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ckb | Central Kurdish | Indo-European | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| cle | Lealao Chinantec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| clu | Caluyanun | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cme | Cerma | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cmn | Mandarin Chinese | Sino-Tibetan | 0 | 0 | 0 | 4 | 10 | 4 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 3 | 4 | 10 | 9 | 0 | 0 | 0 | 0 | 2 | 0 | 46 | -| cmo | Central Mnong | Austroasiatic | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| cni | Asháninka | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cnl | Lalana Chinantec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cnt | Tepetotutla Chinantec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| code | unknown | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 37 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 41 | -| cof | Colorado | Barbacoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| con | Cofán | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cop | Coptic | Afro-Asiatic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cor | Cornish | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cot | Caquinte | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cpa | Palantla Chinantec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cpb | Ucayali-Yurúa Ashéninka | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cpc | Ajyíninka Apurucayali | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cpu | Pichis Ashéninka | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cpy | South Ucayali Ashéninka | Arawakan | 0 | 0 | 
0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| crh | Crimean Tatar | Turkic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| crn | El Nayar Cora | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| crx | Carrier | Athabaskan-Eyak-Tlingit | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| csb | Kashubian | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cso | Sochiapam Chinantec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| csy | Siyin Chin | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cta | Tataltepec Chatino | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cth | Thaiphum Chin | Bookkeeping | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ctp | Western Highland Chatino | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ctu | Chol | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cub | Cubeo | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cuc | Usila Chinantec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cui | Cuiba | Guahiboan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cuk | San Blas Kuna | Chibchan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cut | Teutila Cuicatec | Otomanguean 
| 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cux | Tepeuxila Cuicatec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cwe | Kwere | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cya | Nopala Chatino | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cym | Welsh | Indo-European | 0 | 0 | 0 | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | -| daa | Dangaléat | Afro-Asiatic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dad | Marik | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dah | Gwahatike | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dan | Danish | Indo-European | 0 | 2 | 0 | 7 | 9 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 28 | -| ded | Dedua | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| deu | German | Indo-European | 0 | 2 | 0 | 8 | 14 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 7 | 2 | 20 | 4 | 0 | 0 | 0 | 0 | 4 | 0 | 69 | -| dgc | Casiguran Dumagat Agta | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dgr | Dogrib | Athabaskan-Eyak-Tlingit | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dgz | Daga | Dagan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dhg | Dhangu-Djangu | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dif | Dieri | 
Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dik | Southwestern Dinka | Nilotic | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| div | Dhivehi | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dji | Djinang | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| djk | Eastern Maroon Creole | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| djr | Djambarrpuyngu | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dob | Dobu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| doi | Dogri (macrolanguage) | Unclassified | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| dop | Lukpa | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dov | Dombe | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dsb | Lower Sorbian | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dtp | Kadazan Dusun | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dwr | Dawro | Ta-Ne-Omotic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dww | Dawawa | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dwy | Dhuwaya | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dyu | Dyula | 
Mande | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| dza | Tunzu | Atlantic-Congo | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dzo | Dzongkha | Sino-Tibetan | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| ebk | Eastern Bontok | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| eko | Koti | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ell | Modern Greek (1453-) | Indo-European | 0 | 2 | 0 | 5 | 6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 21 | -| emi | Mussau-Emira | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| emp | Northern Emberá | Chocoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| eng | English | Indo-European | 0 | 3 | 49 | 19 | 160 | 21 | 7 | 10 | 22 | 5 | 0 | 3 | 1 | 13 | 9 | 113 | 13 | 2 | 1 | 6 | 7 | 3 | 24 | 491 | -| enq | Enga | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| epo | Esperanto | Artificial Language | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| eri | Ogea | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ese | Ese Ejja | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| esk | Northwest Alaska Inupiatun | Eskimo-Aleut | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| est | Estonian | Uralic | 0 | 1 | 0 | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 12 | -| etr | Edolo | Bosavi | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| eus | Basque | Unclassified | 0 | 0 | 0 | 3 | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | -| ewe | Ewe | Atlantic-Congo | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| faa | Fasu | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fai | Faiwol | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fao | Faroese | Indo-European | 0 | 0 | 0 | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | -| far | Fataleka | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fas | Persian | Indo-European | 0 | 1 | 0 | 6 | 28 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 2 | 41 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 94 | -| ffm | Maasina Fulfulde | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fij | Fijian | Austronesian | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| fil | Filipino | Austronesian | 0 | 1 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| fin | Finnish | Uralic | 0 | 1 | 0 | 5 | 5 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 2 | 6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 23 | -| fon | Fon | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| for | Fore | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fra | French | Indo-European | 0 | 1 | 0 | 9 | 13 | 8 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 6 | 3 | 17 | 4 | 0 | 1 | 0 | 0 | 4 | 0 | 67 | -| fry | Western 
Frisian | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fuc | Pulaar | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fue | Borgu Fulfulde | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fuf | Pular | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fuh | Western Niger Fulfulde | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fur | Friulian | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| fuv | Nigerian Fulfulde | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| gah | Alekano | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gai | Borei | Ramu | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gam | Kandawo | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gaw | Nobonob | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gaz | West Central Oromo | Afro-Asiatic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| gbm | Garhwali | Indo-European | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| gdn | Umanakaina | Dagan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gdr | Wipi | Eastern Trans-Fly | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 1 | -| geb | Kire | Ramu | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gfk | Patpatar | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ghs | Guhu-Samane | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gla | Scottish Gaelic | Indo-European | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| gle | Irish | Indo-European | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| glg | Galician | Indo-European | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| glk | Gilaki | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| glv | Manx | Indo-European | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gmv | Gamo | Ta-Ne-Omotic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gng | Ngangam | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gnn | Gumatj | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gnw | Western Bolivian Guaraní | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gof | Gofa | Ta-Ne-Omotic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gom | Goan Konkani | Indo-European | 0 | 0 | 0 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| grc | Ancient Greek (to 1453) | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 1 | -| grn | Guarani | Unclassified | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| gsw | Swiss German | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gub | Guajajára | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| guh | Guahibo | Guahiboan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gui | Eastern Bolivian Guaraní | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| guj | Gujarati | Indo-European | 0 | 0 | 0 | 6 | 6 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 18 | -| gul | Sea Island Creole English | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gum | Guambiano | Barbacoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gun | Mbyá Guaraní | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| guo | Guayabero | Guahiboan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gup | Gunwinggu | Gunwinyguan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gux | Gourmanchéma | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gvc | Guanano | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gvf | Golin | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gvn | Kuku-Yalanji | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 1 | -| gvs | Gumawana | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gwi | Gwichʼin | Athabaskan-Eyak-Tlingit | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gym | Ngäbere | Chibchan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gyr | Guarayu | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hat | Haitian | Indo-European | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| hau | Hausa | Afro-Asiatic | 0 | 0 | 0 | 4 | 5 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 14 | -| haw | Hawaiian | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hbo | Ancient Hebrew | Afro-Asiatic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hch | Huichol | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| heb | Hebrew | Afro-Asiatic | 0 | 1 | 0 | 6 | 5 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | -| heg | Helong | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hin | Hindi | Indo-European | 0 | 1 | 0 | 11 | 12 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 2 | 11 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 43 | -| hix | Hixkaryána | Cariban | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hla | Halia | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hlt | Matu Chin | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hmn | Hmong | 
Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hmo | Hiri Motu | Pidgin | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hne | Chhattisgarhi | Indo-European | 0 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| hns | Caribbean Hindustani | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hop | Hopi | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hot | Hote | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hrv | Croatian | Indo-European | 0 | 1 | 0 | 6 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 14 | -| hsb | Upper Sorbian | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hto | Minica Huitoto | Huitotoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hub | Huambisa | Chicham | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hui | Huli | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hun | Hungarian | Uralic | 0 | 1 | 0 | 7 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 16 | -| hus | Huastec | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| huu | Murui Huitoto | Huitotoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| huv | San Mateo Del Mar Huave | Huavean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hvn | Sabu | Austronesian | 0 | 
0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hye | Armenian | Indo-European | 0 | 0 | 0 | 3 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 | -| ian | Iatmul | Ndu | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ibo | Igbo | Atlantic-Congo | 0 | 0 | 0 | 3 | 5 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | -| ido | Ido | Artificial Language | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ign | Ignaciano | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ikk | Ika | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ikw | Ikwere | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ile | Interlingue | Artificial Language | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ilo | Iloko | Austronesian | 0 | 0 | 0 | 2 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| imo | Imbongu | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ina | Interlingua (International Auxiliary Language Association) | Artificial Language | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| inb | Inga | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ind | Indonesian | Austronesian | 0 | 3 | 0 | 8 | 7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 5 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 27 | -| ino | Inoke-Yate | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| 
iou | Tuma-Irumu | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ipi | Ipili | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| isl | Icelandic | Indo-European | 0 | 0 | 0 | 5 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | -| isn | Isanzu | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ita | Italian | Indo-European | 0 | 1 | 0 | 7 | 9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | 7 | 3 | 0 | 0 | 0 | 0 | 4 | 0 | 36 | -| iws | Sepik Iwam | Sepik | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ixl | Ixil | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| jac | Popti' | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| jae | Yabem | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| jao | Yanyuwa | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| jav | Javanese | Austronesian | 0 | 0 | 0 | 4 | 7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 13 | -| jic | Tol | Jicaquean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| jid | Bu (Kaduna State) | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| jiv | Shuar | Chicham | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| jni | Janji | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| jpn | Japanese | Japonic | 0 | 3 | 0 | 7 
| 8 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 3 | 14 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 42 | -| jvn | Caribbean Javanese | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kab | Kabyle | Afro-Asiatic | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| kac | Kachin | Sino-Tibetan | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| kam | Kamba (Kenya) | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| kan | Kannada | Dravidian | 0 | 0 | 0 | 6 | 7 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 19 | -| kaq | Capanahua | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kas | Kashmiri | Indo-European | 0 | 0 | 0 | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| kat | Georgian | Kartvelian | 0 | 0 | 0 | 6 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 13 | -| kaz | Kazakh | Turkic | 0 | 0 | 0 | 5 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11 | -| kbc | Kadiwéu | Guaicuruan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kbh | Camsá | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kbm | Iwal | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kbp | Kabiyè | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| kbq | Kamano | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kdc | Kutu | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kde | Makonde | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kdl | Tsikimba | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kea | Kabuverdianu | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| kek | Kekchí | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ken | Kenyang | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kew | West Kewa | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kfg | Kudiya | Dravidian | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kfy | Kumaoni | Indo-European | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kgf | Kube | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kgk | Kaiwá | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kgp | Kaingang | Nuclear-Macro-Je | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| khk | Halh Mongolian | Mongolic-Khitan | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| khm | Khmer | Austroasiatic | 0 | 0 | 0 | 3 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | -| khs | Kasua | Bosavi | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| khz | Keapara | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 0 | 0 | 1 | -| kik | Kikuyu | Atlantic-Congo | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| kin | Kinyarwanda | Atlantic-Congo | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | -| kir | Kirghiz | Turkic | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | -| kiw | Northeast Kiwai | Kiwaian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kiz | Kisi | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kje | Kisar | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kjs | East Kewa | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kkc | Odoodee | East Strickland | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kkl | Kosarek Yale | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| klt | Nukna | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| klv | Maskelynes | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kmb | Kimbundu | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| kmg | Kâte | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kmh | Kalam | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kmk | Limos Kalinga | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kmo | Kwoma | Sepik | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kmr | Northern Kurdish | Indo-European | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| kms | Kamasau | Nuclear Torricelli | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kmu | Kanite | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| knc | Central Kanuri | Saharan | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| kne | Kankanaey | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| knf | Mankanya | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| knj | Western Kanjobal | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| knv | Tabo | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kon | Kongo | Unclassified | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| kor | Korean | Koreanic | 0 | 2 | 0 | 6 | 8 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 3 | 1 | 10 | 3 | 0 | 0 | 0 | 0 | 2 | 0 | 39 | -| kos | Kosraean | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kpf | Komba | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kpg | Kapingamarangi | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kpj | Karajá | Nuclear-Macro-Je | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kpr | Korafe-Yegha | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kpw | Kobon | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kpx | Mountain Koiali | Koiarian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kqa | Mum | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kqc | Doromu-Koki | Manubaran | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kqf | Kakabai | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kql | Kyenele | Yuat | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kqw | Kandas | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| krc | Karachay-Balkar | Turkic | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ksd | Kuanua | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ksj | Uare | Kwalean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ksr | Borong | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ktm | Kurti | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kto | Kuot | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kud | 'Auhelawa | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 1 | -| kue | Kuman (Papua New Guinea) | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kup | Kunimaipa | Kunimaipan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kur | Kurdish | Unclassified | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| kvg | Kuni-Boazi | Anim | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kvn | Border Kuna | Chibchan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kwd | Kwaio | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kwf | Kwara'ae | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kwi | Awa-Cuaiquer | Barbacoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kwj | Kwanga | Sepik | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kyc | Kyaka | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kyf | Kouya | Kru | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kyg | Keyagana | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kyq | Kenga | Central Sudanic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kyz | Kayabí | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kze | Kosena | Bookkeeping | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| 
kzj | Coastal Kadazan | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lac | Lacandon | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lao | Lao | Tai-Kadai | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| lat | Latin | Indo-European | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| lav | Latvian | Indo-European | 0 | 0 | 0 | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | -| lbb | Label | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lbk | Central Bontok | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lcm | Tungag | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| leu | Kara (Papua New Guinea) | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lex | Luang | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lfn | Lingua Franca Nova | Artificial Language | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lgl | Wala | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lid | Nyindrou | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lif | Limbu | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lij | Ligurian | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| lim | Limburgan | 
Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| lin | Lingala | Atlantic-Congo | 0 | 0 | 0 | 2 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | -| lit | Lithuanian | Indo-European | 0 | 0 | 0 | 6 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11 | -| llg | Lole | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lmo | Lombard | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| ltg | Latgalian | Unclassified | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| ltz | Luxembourgish | Indo-European | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| lua | Luba-Lulua | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| lug | Ganda | Atlantic-Congo | 0 | 0 | 0 | 2 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | -| luo | Luo (Kenya and Tanzania) | Nilotic | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| lus | Lushai | Sino-Tibetan | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| lvs | Standard Latvian | Unclassified | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| lww | Lewo | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| maa | San Jerónimo Tecóatl Mazatec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mad | Madurese | Austronesian | 0 | 0 | 0 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| mag | 
Magahi | Indo-European | 0 | 0 | 0 | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| mai | Maithili | Indo-European | 0 | 0 | 0 | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | -| maj | Jalapa De Díaz Mazatec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mak | Makasar | Austronesian | 0 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| mal | Malayalam | Dravidian | 0 | 0 | 0 | 7 | 7 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 19 | -| mam | Mam | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| maq | Chiquihuitlán Mazatec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mar | Marathi | Indo-European | 0 | 0 | 0 | 9 | 6 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 3 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 23 | -| mau | Huautla Mazatec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mav | Sateré-Mawé | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| max | North Moluccan Malay | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| maz | Central Mazahua | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mbb | Western Bukidnon Manobo | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mbc | Macushi | Cariban | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mbh | Mangseng | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 
-| mbj | Nadëb | Naduhup | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mbl | Maxakalí | Nuclear-Macro-Je | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mbs | Sarangani Manobo | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mbt | Matigsalug Manobo | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mca | Maca | Mataguayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mcb | Machiguenga | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mcd | Sharanahua | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mcf | Matsés | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mco | Coatlán Mixe | Mixe-Zoque | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mcp | Makaa | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mcq | Ese | Koiarian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mcr | Menya | Angan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mdy | Male (Ethiopia) | Ta-Ne-Omotic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| med | Melpa | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mee | Mengen | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mek | Mekeo | Austronesian | 
0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| meq | Merey | Afro-Asiatic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| met | Mato | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| meu | Motu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mey | Hassaniyya | Afro-Asiatic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mgc | Morokodo | Central Sudanic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mgh | Makhuwa-Meetto | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mgw | Matumbi | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mhl | Mauwake | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mhr | Eastern Mari | Uralic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mib | Atatláhuca Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mic | Mi'kmaq | Algic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mie | Ocotepec Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mig | San Miguel El Grande Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mih | Chayuco Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mil | Peñoles Mixtec | 
Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| min | Minangkabau | Austronesian | 0 | 0 | 0 | 3 | 4 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 | -| mio | Pinotepa Nacional Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mir | Isthmus Mixe | Mixe-Zoque | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mit | Southern Puebla Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| miz | Coatzospan Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mjc | San Juan Colorado Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mkd | Macedonian | Indo-European | 0 | 0 | 0 | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | -| mkj | Mokilese | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mkl | Mokole | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mkn | Kupang Malay | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mks | Silacayoapan Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mle | Manambu | Ndu | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mlg | Malagasy | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mlh | Mape | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 1 | -| mlp | Bargam | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mlt | Maltese | Afro-Asiatic | 0 | 0 | 0 | 2 | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 | -| mmo | Mangga Buang | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mmx | Madak | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mna | Mbula | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mni | Manipuri | Sino-Tibetan | 0 | 0 | 0 | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | -| mon | Mongolian | Unclassified | 0 | 0 | 0 | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| mop | Mopán Maya | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mos | Mossi | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| mox | Molima | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mph | Maung | Iwaidjan Proper | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mpj | Martu Wangka | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mpm | Yosondúa Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mpp | Migabac | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mps | Dadibi | Teberan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| 
mpt | Mian | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mpx | Misima-Panaeati | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mqb | Mbuko | Afro-Asiatic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mqj | Mamasa | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mri | Maori | Austronesian | 0 | 1 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| msa | Malay (macrolanguage) | Unclassified | 0 | 0 | 0 | 3 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | -| msb | Masbatenyo | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| msc | Sankaran Maninka | Mande | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| msk | Mansaka | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| msm | Agusan Manobo | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| msy | Aruamu | Ramu | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mti | Maiwa (Papua New Guinea) | Dagan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mto | Totontepec Mixe | Mixe-Zoque | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mui | Musi | Austronesian | 0 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| mup | Malvi | Indo-European | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| mux | 
Bo-Ung | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| muy | Muyang | Afro-Asiatic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mva | Manam | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mvn | Minaveha | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mwc | Are | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mwe | Mwera (Chimwera) | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mwf | Murrinh-Patha | Southern Daly | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mwp | Kala Lagaw Ya | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mwr | Marwari | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mxb | Tezoatlán Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mxp | Tlahuitoltepec Mixe | Mixe-Zoque | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mxq | Juquila Mixe | Mixe-Zoque | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mxt | Jamiltepec Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mya | Burmese | Sino-Tibetan | 0 | 0 | 0 | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 | -| myk | Mamara Senoufo | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 0 | 1 | -| myu | Mundurukú | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| myw | Muyuw | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| myy | Macuna | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mzz | Maiadomu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nab | Southern Nambikuára | Nambiquaran | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| naf | Nabak | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nak | Nakanai | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nas | Naasioi | South Bougainville | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nbl | South Ndebele | Unclassified | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nbq | Nggem | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nca | Iyo | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nch | Central Huasteca Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ncj | Northern Puebla Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ncl | Michoacán Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ncu | Chumburung | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nde | North Ndebele | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ndg | Ndengereko | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ndj | Ndamba | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nds | Low German | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nep | Nepali (macrolanguage) | Unclassified | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| nfa | Dhao | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ngp | Ngulu | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ngu | Guerrero Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nhe | Eastern Huasteca Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nhg | Tetelcingo Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nhi | Zacatlán-Ahuacatlán-Tepetzintla Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nho | Takuu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nhr | Naro | Khoe-Kwadi | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nhu | Noone | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nhw | Western Huasteca Nahuatl | 
Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nhy | Northern Oaxaca Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nif | Nek | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nii | Nii | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nij | Ngaju | Austronesian | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| nin | Ninzo | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nko | Nkonya | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nld | Dutch | Indo-European | 0 | 1 | 0 | 8 | 6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 30 | 2 | 0 | 0 | 0 | 0 | 4 | 0 | 55 | -| nlg | Gela | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nna | Nyangumarta | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nno | Norwegian Nynorsk | Unclassified | 0 | 0 | 0 | 4 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | -| nnq | Ngindo | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| noa | Woun Meu | Chocoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nob | Norwegian Bokmål | Unclassified | 0 | 0 | 0 | 4 | 7 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 19 | -| noe | Nimadi | Indo-European | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nop | Numanggang 
| Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nor | Norwegian | Indo-European | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | -| not | Nomatsiguenga | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nou | Ewage-Notu | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nov | Novial | Artificial Language | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| npi | Nepali (individual language) | Indo-European | 0 | 0 | 0 | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | -| npl | Southeastern Puebla Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nqo | N'Ko | Artificial Language | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| nsn | Nehan | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nso | Pedi | Atlantic-Congo | 0 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| nss | Nali | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ntj | Ngaanyatjarra | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ntp | Northern Tepehuan | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ntu | Natügu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nus | Nuer | Nilotic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 3 | -| nuy | Nunggubuyu | Gunwinyguan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nvm | Namiae | Koiarian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nwi | Southwest Tanna | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nya | Nyanja | Atlantic-Congo | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| nys | Nyungar | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nyu | Nyungwe | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| obo | Obo Manobo | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| oci | Occitan (post 1500) | Indo-European | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| okv | Orokaiva | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| omw | South Tairora | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ong | Olo | Nuclear Torricelli | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ons | Ono | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ood | Tohono O'odham | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| opm | Oksapmin | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ori | Oriya (macrolanguage) | Unclassified | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| orm | Oromo | Unclassified | 0 | 0 | 0 | 1 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| orv | Old Russian | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ory | Odia | Indo-European | 0 | 0 | 0 | 5 | 4 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | -| ote | Mezquital Otomi | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| otm | Eastern Highland Otomi | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| otn | Tenango Otomi | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| otq | Querétaro Otomi | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ots | Estado de México Otomi | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pab | Parecís | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pad | Paumarí | Arawan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pag | Pangasinan | Austronesian | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| pah | Tenharim | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pam | Pampanga | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pan | Panjabi | Indo-European | 0 | 0 | 0 | 6 | 6 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 18 | -| pao | Northern Paiute | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pap | Papiamento | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| pbt | Southern Pashto | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| pcm | Nigerian Pidgin | Indo-European | 0 | 0 | 0 | 1 | 4 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | -| pes | Iranian Persian | Indo-European | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| pib | Yine | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pio | Piapoco | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pir | Piratapuyo | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| piu | Pintupi-Luritja | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pjt | Pitjantjatjara | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pls | San Marcos Tlacoyalco Popoloca | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| plt | Plateau Malagasy | Austronesian | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| plu | Palikúr | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pma | Paama | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pms | Piemontese | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| poe | San Juan Atzingo Popoloca | Otomanguean | 0 | 
0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| poh | Poqomchi' | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| poi | Highland Popoluca | Mixe-Zoque | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pol | Polish | Indo-European | 0 | 1 | 0 | 6 | 11 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 4 | 0 | 19 | 4 | 0 | 0 | 0 | 0 | 2 | 0 | 52 | -| pon | Pohnpeian | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| por | Portuguese | Indo-European | 0 | 1 | 0 | 6 | 9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 3 | 1 | 6 | 3 | 0 | 0 | 0 | 0 | 2 | 0 | 34 | -| poy | Pogolo | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ppo | Folopa | Teberan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| prf | Paranan | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pri | Paicî | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| prs | Dari | Indo-European | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| ptp | Patep | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ptu | Bambam | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pus | Pushto | Unclassified | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| pwg | Gapapaiwa | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qub | Huallaga Huánuco Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| quc | K'iche' | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| quf | Lambayeque Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| quh | South Bolivian Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qul | North Bolivian Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qup | Southern Pastaza Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| quy | Ayacucho Quechua | Quechuan | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| quz | Cusco Quechua | Quechuan | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qvc | Cajamarca Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qve | Eastern Apurímac Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qvh | Huamalíes-Dos de Mayo Huánuco Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qvm | Margos-Yarowilca-Lauricocha Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qvn | North Junín Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qvs | San Martín Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qvw | Huaylla Wanca Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 1 | -| qvz | Northern Pastaza Quichua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qwh | Huaylas Ancash Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qxh | Panao Huánuco Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qxn | Northern Conchucos Ancash Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qxo | Southern Conchucos Ancash Quechua | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| rai | Ramoaaina | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| raj | Rajasthani | Unclassified | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| reg | Kara (Tanzania) | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| rej | Rejang | Austronesian | 0 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| rgu | Ringgou | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| rkb | Rikbaktsa | Nuclear-Macro-Je | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| rmc | Carpathian Romani | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| rmy | Vlax Romani | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| rom | Romany | Unclassified | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| ron | Romanian | Indo-European | 0 | 1 | 0 | 7 | 
6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 22 | -| roo | Rotokas | North Bougainville | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| rop | Kriol | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| row | Dela-Oenale | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| rro | Waima | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ruf | Luguru | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| rug | Roviana | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| run | Rundi | Atlantic-Congo | 0 | 0 | 0 | 1 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| rus | Russian | Indo-European | 0 | 2 | 0 | 7 | 13 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 4 | 2 | 17 | 4 | 0 | 0 | 0 | 0 | 2 | 0 | 59 | -| rwo | Rawa | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sab | Buglere | Chibchan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sag | Sango | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| sah | Yakut | Turkic | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| san | Sanskrit | Indo-European | 0 | 0 | 0 | 5 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| sat | Santali | Austroasiatic | 0 | 0 | 0 | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | -| sbe | Saliba | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sbk | Safwa | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sbs | Subiya | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| scn | Sicilian | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| sco | Scots | Indo-European | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| seh | Sena | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sey | Secoya | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sgb | Mag-antsi Ayta | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sgz | Sursurunga | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| shi | Tachelhit | Afro-Asiatic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| shj | Shatt | Dajuic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| shn | Shan | Tai-Kadai | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| shp | Shipibo-Conibo | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sim | Mende (Papua New Guinea) | Sepik | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sin | Sinhala | Indo-European | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | -| sja | Epena | Chocoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 
-| slk | Slovak | Indo-European | 0 | 0 | 0 | 5 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | -| sll | Salt-Yui | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| slv | Slovenian | Indo-European | 0 | 0 | 0 | 5 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 13 | -| smk | Bolinao | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| smo | Samoan | Austronesian | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| sna | Shona | Atlantic-Congo | 0 | 0 | 0 | 2 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | -| snc | Sinaugoro | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| snd | Sindhi | Indo-European | 0 | 0 | 0 | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | -| snn | Siona | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| snp | Siane | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| snx | Sam | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sny | Saniyo-Hiyewe | Sepik | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| som | Somali | Afro-Asiatic | 0 | 0 | 0 | 3 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 | -| soq | Kanasi | Dagan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sot | Southern Sotho | Atlantic-Congo | 0 | 0 | 0 | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| soy | Miyobe | 
Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| spa | Spanish | Indo-European | 0 | 2 | 0 | 6 | 13 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 3 | 2 | 15 | 4 | 0 | 0 | 0 | 0 | 4 | 0 | 54 | -| spl | Selepet | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| spm | Akukem | Ramu | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| spp | Supyire Senoufo | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sps | Saposa | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| spy | Sabaot | Nilotic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sqi | Albanian | Unclassified | 0 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| srd | Sardinian | Unclassified | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| sri | Siriano | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| srm | Saramaccan | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| srn | Sranan Tongo | Indo-European | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| srp | Serbian | Indo-European | 0 | 0 | 0 | 6 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | -| srq | Sirionó | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ssd | Siroi | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ssg | Seimat | Austronesian | 0 | 0 | 
0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ssw | Swati | Atlantic-Congo | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | -| ssx | Samberigi | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| stp | Southeastern Tepehuan | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sua | Sulka | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sue | Suena | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sun | Sundanese | Austronesian | 0 | 0 | 0 | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 | -| sus | Susu | Mande | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| suz | Sunwar | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| svk | Slovakian Sign Language | Sign Language | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| swa | Swahili (macrolanguage) | Atlantic-Congo | 0 | 1 | 0 | 1 | 7 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 16 | -| swe | Swedish | Indo-European | 0 | 1 | 0 | 6 | 8 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 26 | -| swg | Swabian | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| swh | Swahili (individual language) | Atlantic-Congo | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| swp | Suau | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| 
sxb | Suba | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| szl | Silesian | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| tac | Lowland Tarahumara | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tah | Tahitian | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| taj | Eastern Tamang | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tam | Tamil | Dravidian | 0 | 0 | 0 | 7 | 7 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 21 | -| taq | Tamasheq | Afro-Asiatic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| tat | Tatar | Turkic | 0 | 0 | 0 | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| tav | Tatuyo | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| taw | Tai | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tbc | Takia | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tbf | Mandara | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tbg | North Tairora | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tbo | Tawala | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tbz | Ditammari | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tca | Ticuna | 
Ticuna-Yuri | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tcs | Torres Strait Creole | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tcz | Thado Chin | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tdt | Tetun Dili | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tee | Huehuetla Tepehua | Totonacan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tel | Telugu | Dravidian | 0 | 1 | 0 | 7 | 7 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 5 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 25 | -| ter | Tereno | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tet | Tetum | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tew | Tewa (USA) | Kiowa-Tanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tfr | Teribe | Chibchan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tgk | Tajik | Indo-European | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| tgl | Tagalog | Austronesian | 0 | 0 | 0 | 5 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11 | -| tgo | Sudest | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tgp | Tangoa | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tha | Thai | Tai-Kadai | 0 | 1 | 0 | 6 | 8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 25 | -| tif | Tifal | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tim | Timbe | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tir | Tigrinya | Afro-Asiatic | 0 | 0 | 0 | 2 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | -| tiw | Tiwi | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tiy | Tiruray | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tke | Takwane | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tku | Upper Necaxa Totonac | Totonacan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tlf | Telefol | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tmd | Haruai | Piawi | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tna | Tacana | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tnc | Tanimuca-Retuarã | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tnk | Kwamera | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tnn | North Tanna | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tnp | Whitesands | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| toc | Coyutla Totonac | Totonacan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tod | Toma | Mande | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tof | Gizrra | Eastern Trans-Fly | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| toj | Tojolabal | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ton | Tonga (Tonga Islands) | Austronesian | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| too | Xicotepec De Juárez Totonac | Totonacan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| top | Papantla Totonac | Totonacan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tos | Highland Totonac | Totonacan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tpa | Taupota | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tpi | Tok Pisin | Indo-European | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| tpt | Tlachichilco Tepehua | Totonacan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tpz | Tinputz | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| trc | Copala Triqui | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tsn | Tswana | Atlantic-Congo | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | -| tso | Tsonga | Atlantic-Congo | 0 | 0 | 0 | 1 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | -| tsw | Tsishingini | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ttc | Tektiteko | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tte | Bwanabwana | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tuc | Mutu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tue | Tuyuca | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tuf | Central Tunebo | Chibchan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tuk | Turkmen | Turkic | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| tum | Tumbuka | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| tuo | Tucano | Tucanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tur | Turkish | Turkic | 0 | 3 | 0 | 6 | 7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0 | 4 | 2 | 0 | 0 | 0 | 0 | 2 | 0 | 28 | -| tvk | Southeast Ambrym | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| twi | Twi | Unclassified | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| txq | Tii | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| txu | Kayapó | Nuclear-Macro-Je | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tyv | Tuvinian | Turkic | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tzj | Tz'utujil | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tzl | Talossan | Artificial Language | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 1 | -| tzm | Central Atlas Tamazight | Afro-Asiatic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| tzo | Tzotzil | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ubr | Ubir | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ubu | Umbu-Ungu | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| udu | Uduk | Koman | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| uig | Uighur | Turkic | 0 | 0 | 0 | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | -| ukr | Ukrainian | Indo-European | 0 | 1 | 0 | 6 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | -| uli | Ulithian | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ulk | Meriam Mir | Eastern Trans-Fly | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| umb | Umbundu | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| upv | Uripiv-Wala-Rano-Atchin | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ura | Urarina | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| urb | Urubú-Kaapor | Tupian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| urd | Urdu | Indo-European | 0 | 0 | 0 | 9 | 8 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 22 | -| uri | Urim | Nuclear Torricelli | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| urt 
| Urat | Nuclear Torricelli | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| urw | Sop | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| usa | Usarufa | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| usp | Uspanteco | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| uvh | Uri | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| uvl | Lote | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| uzb | Uzbek | Unclassified | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| uzn | Northern Uzbek | Turkic | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| vec | Venetian | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| ven | Venda | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| vid | Vidunda | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| vie | Vietnamese | Austroasiatic | 0 | 2 | 0 | 7 | 6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 23 | -| viv | Iduna | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| vmy | Ayautla Mazatec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| waj | Waffa | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wal | 
Wolaytta | Ta-Ne-Omotic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wap | Wapishana | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| war | Waray (Philippines) | Austronesian | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| wat | Kaninuwa | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wbi | Vwanji | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wbp | Warlpiri | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wed | Wedau | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wer | Weri | Kunimaipan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wim | Wik-Mungkan | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wiu | Wiru | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wiv | Vitu | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wln | Walloon | Indo-European | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wmt | Walmajarri | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wmw | Mwani | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wnc | Wantoat | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wnu | Usan | Nuclear Trans New 
Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wol | Wolof | Atlantic-Congo | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| wos | Hanga Hundi | Ndu | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wrk | Garrwa | Garrwan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wro | Worrorra | Worrorran | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wrs | Waris | Border | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wsk | Waskia | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wuu | Wu Chinese | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wuv | Wuvulu-Aua | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| xav | Xavánte | Nuclear-Macro-Je | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| xbi | Kombio | Nuclear Torricelli | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| xed | Hdi | Afro-Asiatic | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| xho | Xhosa | Atlantic-Congo | 0 | 0 | 0 | 3 | 3 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| xla | Kamula | Kamula-Elevala | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| xnn | Northern Kankanay | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| xon | Konkomba | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| xsi | Sio | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| xtd | Diuxi-Tilantongo Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| xtm | Magdalena Peñasco Mixtec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yaa | Yaminahua | Pano-Tacanan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yad | Yagua | Peba-Yagua | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yal | Yalunka | Mande | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yap | Yapese | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yaq | Yaqui | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yby | Yaweyuha | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ycn | Yucuna | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ydd | Eastern Yiddish | Indo-European | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| yid | Yiddish | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yka | Yakan | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yle | Yele | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yml | Iamalele | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yon | Yongkom | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yor | Yoruba | Atlantic-Congo | 0 | 0 | 0 | 4 | 5 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 16 | -| yrb | Yareba | Yareban | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yre | Yaouré | Mande | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yss | Yessan-Mayo | Sepik | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yue | Yue Chinese | Sino-Tibetan | 0 | 0 | 0 | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| yuj | Karkar-Yuri | Pauwasi | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yut | Yopno | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yuw | Yau (Morobe Province) | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yva | Yawa | Yawa-Saweru | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zaa | Sierra de Juárez Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zab | Western Tlacolula Valley Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zac | Ocotlán Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zad | Cajonos Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zai | Isthmus Zapotec | Otomanguean | 0 
| 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zaj | Zaramo | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zam | Miahuatlán Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zao | Ozolotepec Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zap | Zapotec | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zar | Rincón Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zas | Santo Domingo Albarradas Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zat | Tabaa Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zav | Yatzachi Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zaw | Mitla Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zca | Coatecas Altas Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zga | Kinga | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zho | Chinese | Unclassified | 0 | 2 | 0 | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 26 | -| zia | Zia | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ziw | Zigula | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 1 | -| zlm | Malay (individual language) | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zos | Francisco León Zoque | Mixe-Zoque | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zpc | Choapan Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zpl | Lachixío Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zpm | Mixtepec Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zpo | Amatlán Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zpq | Zoogocho Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zpu | Yalálag Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zpv | Chichicapan Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zpz | Texmelucan Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zsm | Standard Malay | Austronesian | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| zsr | Southern Rincon Zapotec | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ztq | Quioquitani-Quierí Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zty | Yatee Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zul | Zulu | 
Atlantic-Congo | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | -| zyp | Zyphe Chin | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| Total | None | None | None | 0 | 55 | 49 | 1492 | 836 | 316 | 7 | 10 | 22 | 5 | 0 | 3 | 28 | 91 | 56 | 591 | 88 | 2 | 2 | 6 | 7 | 37 | 24 | -<<<<<<< HEAD -======= +| ISO Code | Language | Family | Any2AnyMultiChoice | Any2AnyMultilingualRetrieval | Any2AnyRetrieval | AudioClassification | AudioClustering | AudioCrossFoldClassification | AudioMultilabelClassification | AudioPairClassification | AudioZeroshotClassification | BitextMining | Classification | Clustering | Compositionality | DocumentUnderstanding | ImageClassification | ImageClustering | ImageMultilabelClassification | InstructionRetrieval | MultilabelClassification | PairClassification | Reranking | Retrieval | STS | Speed | Summarization | VisionCentricQA | VisualSTS(eng) | VisualSTS(multi) | ZeroShotClassification | Sum | +|----------|----------|--------|--------------------|------------------------------|------------------|---------------------|-----------------|------------------------------|-------------------------------|-------------------------|-----------------------------|--------------|----------------|------------|------------------|-----------------------|---------------------|-----------------|-------------------------------|----------------------|--------------------------|--------------------|-----------|-----------|-----|-------|---------------|-----------------|----------------|------------------|------------------------|-----| +| aai | Arifama-Miniafia | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aak | Ankave | Angan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aau | 
Abau | Sepik | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aaz | Amarasi | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| abs | Ambonese Malay | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| abt | Ambulas | Ndu | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| abx | Inabaknon | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aby | Aneme Wake | Yareban | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ace | Achinese | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| acf | Saint Lucian Creole French | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| acm | Mesopotamian Arabic | Afro-Asiatic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| acq | Ta'izzi-Adeni Arabic | Afro-Asiatic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| acr | Achi | Mayan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| acu | Achuar-Shiwiar | Chicham | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| adz | Adzera | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 1 | +| aeb | Tunisian Arabic | Afro-Asiatic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| aer | Eastern Arrernte | Pama-Nyungan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aey | Amele | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| afr | Afrikaans | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| agd | Agarabi | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| agg | Angor | Senagi | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| agm | Angaataha | Angan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| agn | Agutaynen | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| agr | Aguaruna | Chicham | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| agt | Central Cagayan Agta | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| agu | Aguacateco | Mayan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aia | Arosi | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aii | Assyrian Neo-Aramaic | Afro-Asiatic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ajp | South Levantine Arabic | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| aka | Akan | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| ake | Akawaio | Cariban | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| alp | Alune | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| alq | Algonquin | Algic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| als | Tosk Albanian | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| aly | Alyawarr | Pama-Nyungan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ame | Yanesha' | Arawakan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| amf | Hamer-Banna | South Omotic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| amh | Amharic | Afro-Asiatic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 6 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 14 | +| amk | Ambai | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| amm | Ama (Papua New Guinea) | Left May | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| amn | Amanab | Border | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| amo | Amo | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| amp | Alamblak | Sepik | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| amr | Amarakaeri | Harakmbut | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| amu | Guerrero Amuzgo | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| amx | Anmatyerre | Pama-Nyungan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ang | Old English (ca. 450-1100) | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| anh | Nend | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| anp | Angika | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| anv | Denya | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aoi | Anindilyakwa | Gunwinyguan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aoj | Mufian | Nuclear Torricelli | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aom | Ömie | Koiarian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aon | Bumbita Arapesh | Nuclear 
Torricelli | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| apb | Sa'a | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| apc | Levantine Arabic | Afro-Asiatic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| ape | Bukiyip | Nuclear Torricelli | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| apn | Apinayé | Nuclear-Macro-Je | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| apr | Arop-Lokep | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| apu | Apurinã | Arawakan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| apw | Western Apache | Athabaskan-Eyak-Tlingit | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| apz | Safeyoka | Angan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ara | Arabic | Unclassified | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 2 | 10 | 2 | 0 | 0 | 0 | 0 | 2 | 0 | 36 | +| arb | Standard Arabic | Afro-Asiatic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | +| are | Western Arrarnta | Pama-Nyungan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| arl | Arabela | Zaparoan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 1 | +| arn | Mapudungun | Araucanian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| arp | Arapaho | Algic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| arq | Algerian Arabic | Afro-Asiatic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| ars | Najdi Arabic | Afro-Asiatic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| ary | Moroccan Arabic | Afro-Asiatic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | +| arz | Egyptian Arabic | Afro-Asiatic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| asm | Assamese | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 3 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 14 | +| aso | Dano | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ast | Asturian | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| ata | Pele-Ata | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| atb | Zaiwa | Sino-Tibetan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| atd | Ata Manobo | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| atg | Ivbie North-Okpela-Arhe | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| att | Pamplona Atta | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| auc | Waorani | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aui | Anuki | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| auy | Awiyaana | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| avt | Au | Nuclear Torricelli | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| awa | Awadhi | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| awb | Awa (Papua New Guinea) | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| awk | Awabakal | Pama-Nyungan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| awx | Awara | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ayr | Central Aymara | Aymaran | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| azb | South Azerbaijani | Turkic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| aze | Azerbaijani | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | 
+| azg | San Pedro Amuzgos Amuzgo | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| azj | North Azerbaijani | Turkic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| azz | Highland Puebla Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bak | Bashkir | Turkic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| bam | Bambara | Mande | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| ban | Balinese | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| bao | Waimaha | Tucanoan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bba | Baatonum | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bbb | Barai | Koiarian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bbc | Batak Toba | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| bbr | Girawa | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bch | Bariai | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bco | Kaluli | Bosavi | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 1 | +| bdd | Bunama | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bea | Beaver | Athabaskan-Eyak-Tlingit | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bef | Benabena | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bel | Belarusian | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| bem | Bemba (Zambia) | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| ben | Bengali | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 | 9 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 32 | +| beo | Beami | Bosavi | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ber | Berber (Other) | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| beu | Blagar | Timor-Alor-Pantar | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bew | Betawi | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| bgc | Haryanvi | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| bgs | Tagabawa | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bgt | Bughotu | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bhb | Bhili | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bhd | Bhadrawahi | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bhg | Binandere | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bhl | Bimin | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bho | Bhojpuri | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| bhp | Bima | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| big | Biangai | Kunimaipan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bjj | Kanauji | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bjk | Barok | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bjn | Banjar | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| bjp | Fanamaket | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bjr | Binumarien | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bjv | Bedjond | Central Sudanic | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bjz | Baruga | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bkd | Binukid | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bki | Baki | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bkq | Bakairí | Cariban | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bkx | Baikeno | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| blw | Balangao | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| blz | Balantak | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bmh | Kein | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bmk | Ghayavi | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bmr | Muinane | Boran | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bmu | Somba-Siawari | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bnp | Bola | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bns | Bundeli 
| Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| boa | Bora | Boran | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bod | Tibetan | Sino-Tibetan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| boj | Anjam | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bon | Bine | Eastern Trans-Fly | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bos | Bosnian | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| box | Buamu | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| boy | Bodo (Central African Republic) | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bpr | Koronadal Blaan | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bps | Sarangani Blaan | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bqc | Boko (Benin) | Mande | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bqp | Busa | Mande | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bra | Braj | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 1 | +| bre | Breton | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| brx | Bodo (India) | Sino-Tibetan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| bsj | Bangwinji | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bsn | Barasana-Eduria | Tucanoan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bsp | Baga Sitemu | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bss | Akoose | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bug | Buginese | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | +| buk | Bugawac | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bul | Bulgarian | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 17 | +| bus | Bokobaru | Mande | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bvd | Baeggu | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bvr | Burarra | Maningrida | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bxh | Buhutu | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| byr | Baruya | Angan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| byx | Qaqet | Baining | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bzd | Bribri | Chibchan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bzh | Mapos Buang | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bzj | Belize Kriol English | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| caa | Chortí | Mayan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cab | Garifuna | Arawakan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cac | Chuj | Mayan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| caf | Southern Carrier | Athabaskan-Eyak-Tlingit | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cak | Kaqchikel | Mayan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cao | Chácobo | Pano-Tacanan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cap | Chipaya | Uru-Chipaya | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| car | Galibi Carib | Cariban | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cat | Catalan | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11 | +| cav | Cavineña | Pano-Tacanan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cax | Chiquitano | Chiquitano | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cbc | Carapana | Tucanoan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cbi | Chachi | Barbacoan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cbk | Chavacano | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| cbr | Cashibo-Cacataibo | Pano-Tacanan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cbs | Cashinahua | Pano-Tacanan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cbt | Chayahuita | Cahuapanan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cbu | Candoshi-Shapra | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cbv | Cacua | Kakua-Nukak | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cco | Comaltepec Chinantec | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ceb | Cebuano | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| cek | Eastern Khumi Chin | Sino-Tibetan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ces | Czech | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | 5 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 21 | +| cgc | Kagayanen | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cha | Chamorro | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| chd | Highland Oaxaca Chontal | Tequistlatecan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| chf | Tabasco Chontal | Mayan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| chk | Chuukese | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| chq | Quiotepec Chinantec | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| chv | Chuvash | Turkic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| chz | Ozumacín Chinantec | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cjk | Chokwe | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| cjo | Ashéninka Pajonal | Arawakan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cjv | Chuave | Nuclear Trans 
New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ckb | Central Kurdish | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| cle | Lealao Chinantec | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| clu | Caluyanun | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cme | Cerma | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cmn | Mandarin Chinese | Sino-Tibetan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 10 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 4 | 10 | 9 | 0 | 0 | 0 | 0 | 2 | 0 | 46 | +| cmo | Central Mnong | Austroasiatic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| cni | Asháninka | Arawakan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cnl | Lalana Chinantec | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cnt | Tepetotutla Chinantec | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| code | unknown | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 37 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 41 | +| cof | Colorado | Barbacoan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| con | Cofán | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cop | Coptic | Afro-Asiatic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cor | Cornish | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cot | Caquinte | Arawakan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cpa | Palantla Chinantec | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cpb | Ucayali-Yurúa Ashéninka | Arawakan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cpc | Ajyíninka Apurucayali | Arawakan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cpu | Pichis Ashéninka | Arawakan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cpy | South Ucayali Ashéninka | Arawakan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| crh | Crimean Tatar | Turkic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| crn | El Nayar Cora | Uto-Aztecan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| crx | Carrier | Athabaskan-Eyak-Tlingit | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| csb | Kashubian | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cso | Sochiapam Chinantec | Otomanguean | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| csy | Siyin Chin | Sino-Tibetan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cta | Tataltepec Chatino | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cth | Thaiphum Chin | Bookkeeping | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ctp | Western Highland Chatino | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ctu | Chol | Mayan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cub | Cubeo | Tucanoan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cuc | Usila Chinantec | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cui | Cuiba | Guahiboan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cuk | San Blas Kuna | Chibchan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cut | Teutila Cuicatec | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cux | Tepeuxila Cuicatec | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cwe | Kwere | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
1 | +| cya | Nopala Chatino | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cym | Welsh | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | +| daa | Dangaléat | Afro-Asiatic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dad | Marik | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dah | Gwahatike | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dan | Danish | Indo-European | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | 10 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 30 | +| ded | Dedua | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| deu | German | Indo-European | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 14 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 7 | 2 | 20 | 4 | 0 | 0 | 0 | 0 | 4 | 0 | 69 | +| dgc | Casiguran Dumagat Agta | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dgr | Dogrib | Athabaskan-Eyak-Tlingit | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dgz | Daga | Dagan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dhg | Dhangu-Djangu | Pama-Nyungan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dif | Dieri | Pama-Nyungan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dik | Southwestern Dinka | Nilotic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| div | Dhivehi | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dji | Djinang | Pama-Nyungan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| djk | Eastern Maroon Creole | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| djr | Djambarrpuyngu | Pama-Nyungan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dob | Dobu | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| doi | Dogri (macrolanguage) | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| dop | Lukpa | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dov | Dombe | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dsb | Lower Sorbian | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dtp | Kadazan Dusun | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dwr | Dawro | Ta-Ne-Omotic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dww | Dawawa | Austronesian | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dwy | Dhuwaya | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dyu | Dyula | Mande | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| dza | Tunzu | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dzo | Dzongkha | Sino-Tibetan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| ebk | Eastern Bontok | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| eko | Koti | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ell | Modern Greek (1453-) | Indo-European | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 21 | +| emi | Mussau-Emira | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| emp | Northern Emberá | Chocoan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| eng | English | Indo-European | 0 | 3 | 49 | 16 | 3 | 0 | 3 | 0 | 3 | 19 | 161 | 21 | 7 | 10 | 22 | 5 | 0 | 3 | 1 | 13 | 9 | 113 | 13 | 2 | 1 | 6 | 7 | 3 | 24 | 517 | +| enq | Enga | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| epo | Esperanto | Artificial Language | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 5 | +| eri | Ogea | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ese | Ese Ejja | Pano-Tacanan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| esk | Northwest Alaska Inupiatun | Eskimo-Aleut | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| est | Estonian | Uralic | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | +| etr | Edolo | Bosavi | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| eus | Basque | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | +| ewe | Ewe | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| faa | Fasu | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| fai | Faiwol | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| fao | Faroese | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | +| far | Fataleka | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| fas | Persian | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | 28 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 2 | 41 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 94 | +| ffm | Maasina Fulfulde | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| fij | Fijian | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| fil | Filipino | Austronesian | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| fin | Finnish | Uralic | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 2 | 6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 24 | +| fon | Fon | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| for | Fore | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| fra | French | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 | 13 | 8 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 6 | 3 | 17 | 4 | 0 | 1 | 0 | 0 | 4 | 0 | 67 | +| fry | Western Frisian | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| fuc | Pulaar | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| fue | Borgu Fulfulde | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| fuf | Pular | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| fuh | Western Niger Fulfulde | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| fur | Friulian | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| fuv | Nigerian Fulfulde | Atlantic-Congo | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| gah | Alekano | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gai | Borei | Ramu | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gam | Kandawo | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gaw | Nobonob | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gaz | West Central Oromo | Afro-Asiatic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| gbm | Garhwali | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| gdn | Umanakaina | Dagan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gdr | Wipi | Eastern Trans-Fly | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| geb | Kire | Ramu | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gfk | Patpatar | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ghs | Guhu-Samane | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gla | Scottish Gaelic | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 4 | +| gle | Irish | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| glg | Galician | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| glk | Gilaki | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| glv | Manx | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gmv | Gamo | Ta-Ne-Omotic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gng | Ngangam | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gnn | Gumatj | Pama-Nyungan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gnw | Western Bolivian Guaraní | Tupian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gof | Gofa | Ta-Ne-Omotic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gom | Goan Konkani | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| grc | Ancient Greek (to 1453) | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| grn | Guarani | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| gsw | Swiss German | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gub | Guajajára | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| guh | Guahibo | Guahiboan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gui | Eastern Bolivian Guaraní | Tupian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| guj | Gujarati | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | 6 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 18 | +| gul | Sea Island Creole English | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gum | Guambiano | Barbacoan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gun | Mbyá Guaraní | Tupian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| guo | Guayabero | Guahiboan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gup | Gunwinggu | Gunwinyguan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gux | Gourmanchéma | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gvc | Guanano | Tucanoan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gvf | Golin | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gvn | Kuku-Yalanji | Pama-Nyungan | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gvs | Gumawana | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gwi | Gwichʼin | Athabaskan-Eyak-Tlingit | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gym | Ngäbere | Chibchan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gyr | Guarayu | Tupian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hat | Haitian | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| hau | Hausa | Afro-Asiatic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 5 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 14 | +| haw | Hawaiian | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hbo | Ancient Hebrew | Afro-Asiatic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hch | Huichol | Uto-Aztecan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| heb | Hebrew | Afro-Asiatic | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | 5 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | +| heg | Helong | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hin | Hindi | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11 | 12 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 2 | 11 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 43 | +| hix | Hixkaryána | Cariban | 0 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hla | Halia | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hlt | Matu Chin | Sino-Tibetan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hmn | Hmong | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hmo | Hiri Motu | Pidgin | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hne | Chhattisgarhi | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| hns | Caribbean Hindustani | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hop | Hopi | Uto-Aztecan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hot | Hote | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hrv | Croatian | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 14 | +| hsb | Upper Sorbian | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hto | Minica Huitoto | Huitotoan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hub | Huambisa | Chicham | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hui | Huli | 
Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hun | Hungarian | Uralic | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 16 | +| hus | Huastec | Mayan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| huu | Murui Huitoto | Huitotoan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| huv | San Mateo Del Mar Huave | Huavean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hvn | Sabu | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hye | Armenian | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 | +| ian | Iatmul | Ndu | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ibo | Igbo | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 5 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | +| ido | Ido | Artificial Language | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ign | Ignaciano | Arawakan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ikk | Ika | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ikw | Ikwere | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ile | 
Interlingue | Artificial Language | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ilo | Iloko | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| imo | Imbongu | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ina | Interlingua (International Auxiliary Language Association) | Artificial Language | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| inb | Inga | Quechuan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ind | Indonesian | Austronesian | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 5 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 27 | +| ino | Inoke-Yate | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| iou | Tuma-Irumu | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ipi | Ipili | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| isl | Icelandic | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | +| isn | Isanzu | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ita | Italian | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | 9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | 7 | 3 | 0 | 0 | 0 | 0 | 4 | 0 | 36 | +| iws | Sepik Iwam | Sepik | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ixl | Ixil | Mayan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| jac | Popti' | Mayan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| jae | Yabem | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| jao | Yanyuwa | Pama-Nyungan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| jav | Javanese | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 13 | +| jic | Tol | Jicaquean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| jid | Bu (Kaduna State) | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| jiv | Shuar | Chicham | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| jni | Janji | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| jpn | Japanese | Japonic | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | 8 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 3 | 14 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 42 | +| jvn | Caribbean Javanese | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kab | Kabyle | Afro-Asiatic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| kac | Kachin | Sino-Tibetan | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| kam | Kamba (Kenya) | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| kan | Kannada | Dravidian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | 7 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 19 | +| kaq | Capanahua | Pano-Tacanan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kas | Kashmiri | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| kat | Georgian | Kartvelian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 13 | +| kaz | Kazakh | Turkic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11 | +| kbc | Kadiwéu | Guaicuruan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kbh | Camsá | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kbm | Iwal | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kbp | Kabiyè | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| kbq | Kamano | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kdc | Kutu | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kde | Makonde | Atlantic-Congo | 0 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kdl | Tsikimba | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kea | Kabuverdianu | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| kek | Kekchí | Mayan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ken | Kenyang | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kew | West Kewa | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kfg | Kudiya | Dravidian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kfy | Kumaoni | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kgf | Kube | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kgk | Kaiwá | Tupian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kgp | Kaingang | Nuclear-Macro-Je | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| khk | Halh Mongolian | Mongolic-Khitan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| khm | Khmer | Austroasiatic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | +| khs | 
Kasua | Bosavi | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| khz | Keapara | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kik | Kikuyu | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| kin | Kinyarwanda | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | +| kir | Kirghiz | Turkic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | +| kiw | Northeast Kiwai | Kiwaian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kiz | Kisi | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kje | Kisar | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kjs | East Kewa | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kkc | Odoodee | East Strickland | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kkl | Kosarek Yale | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| klt | Nukna | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| klv | Maskelynes | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kmb | Kimbundu | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| kmg | Kâte | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kmh | Kalam | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kmk | Limos Kalinga | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kmo | Kwoma | Sepik | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kmr | Northern Kurdish | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| kms | Kamasau | Nuclear Torricelli | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kmu | Kanite | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| knc | Central Kanuri | Saharan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| kne | Kankanaey | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| knf | Mankanya | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| knj | Western Kanjobal | Mayan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| knv | Tabo | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kon | Kongo | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| kor | Korean | Koreanic | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | 8 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 3 | 1 | 10 | 3 | 0 | 0 | 0 | 0 | 2 | 0 | 39 | +| kos | Kosraean | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kpf | Komba | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kpg | Kapingamarangi | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kpj | Karajá | Nuclear-Macro-Je | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kpr | Korafe-Yegha | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kpw | Kobon | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kpx | Mountain Koiali | Koiarian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kqa | Mum | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kqc | Doromu-Koki | Manubaran | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kqf | Kakabai | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kql 
| Kyenele | Yuat | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kqw | Kandas | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| krc | Karachay-Balkar | Turkic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ksd | Kuanua | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ksj | Uare | Kwalean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ksr | Borong | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ktm | Kurti | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kto | Kuot | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kud | 'Auhelawa | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kue | Kuman (Papua New Guinea) | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kup | Kunimaipa | Kunimaipan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kur | Kurdish | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| kvg | Kuni-Boazi | Anim | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 1 | +| kvn | Border Kuna | Chibchan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kwd | Kwaio | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kwf | Kwara'ae | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kwi | Awa-Cuaiquer | Barbacoan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kwj | Kwanga | Sepik | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kyc | Kyaka | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kyf | Kouya | Kru | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kyg | Keyagana | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kyq | Kenga | Central Sudanic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kyz | Kayabí | Tupian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kze | Kosena | Bookkeeping | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kzj | Coastal Kadazan | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lac | Lacandon | Mayan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 1 | +| lao | Lao | Tai-Kadai | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| lat | Latin | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| lav | Latvian | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | +| lbb | Label | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lbk | Central Bontok | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lcm | Tungag | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| leu | Kara (Papua New Guinea) | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lex | Luang | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lfn | Lingua Franca Nova | Artificial Language | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lgl | Wala | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lid | Nyindrou | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lif | Limbu | Sino-Tibetan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lij | Ligurian | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| lim | Limburgan | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| lin | Lingala | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | +| lit | Lithuanian | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11 | +| llg | Lole | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lmo | Lombard | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| ltg | Latgalian | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| ltz | Luxembourgish | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| lua | Luba-Lulua | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| lug | Ganda | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | +| luo | Luo (Kenya and Tanzania) | Nilotic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| lus | Lushai | Sino-Tibetan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| lvs | Standard Latvian | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| lww | Lewo | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| maa | San Jerónimo Tecóatl Mazatec | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mad | Madurese | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| mag | Magahi | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| mai | Maithili | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | +| maj | Jalapa De Díaz Mazatec | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mak | Makasar | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| mal | Malayalam | Dravidian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | 7 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 19 | +| mam | Mam | Mayan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| maq | Chiquihuitlán Mazatec | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mar | Marathi | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 | 6 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 3 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 23 | +| mau | Huautla Mazatec | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mav | Sateré-Mawé | Tupian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| max | North 
Moluccan Malay | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| maz | Central Mazahua | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mbb | Western Bukidnon Manobo | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mbc | Macushi | Cariban | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mbh | Mangseng | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mbj | Nadëb | Naduhup | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mbl | Maxakalí | Nuclear-Macro-Je | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mbs | Sarangani Manobo | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mbt | Matigsalug Manobo | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mca | Maca | Mataguayan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mcb | Machiguenga | Arawakan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mcd | Sharanahua | Pano-Tacanan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mcf | Matsés | Pano-Tacanan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mco | Coatlán Mixe | Mixe-Zoque | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mcp | Makaa | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mcq | Ese | Koiarian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mcr | Menya | Angan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mdy | Male (Ethiopia) | Ta-Ne-Omotic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| med | Melpa | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mee | Mengen | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mek | Mekeo | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| meq | Merey | Afro-Asiatic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| met | Mato | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| meu | Motu | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mey | Hassaniyya | Afro-Asiatic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mgc | Morokodo | Central Sudanic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mgh | Makhuwa-Meetto | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mgw | Matumbi | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mhl | Mauwake | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mhr | Eastern Mari | Uralic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mib | Atatláhuca Mixtec | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mic | Mi'kmaq | Algic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mie | Ocotepec Mixtec | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mig | San Miguel El Grande Mixtec | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mih | Chayuco Mixtec | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mil | Peñoles Mixtec | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| min | Minangkabau | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 4 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 | +| mio | Pinotepa Nacional Mixtec | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mir | Isthmus Mixe | 
Mixe-Zoque | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mit | Southern Puebla Mixtec | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| miz | Coatzospan Mixtec | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mjc | San Juan Colorado Mixtec | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mkd | Macedonian | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | +| mkj | Mokilese | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mkl | Mokole | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mkn | Kupang Malay | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mks | Silacayoapan Mixtec | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mle | Manambu | Ndu | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mlg | Malagasy | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mlh | Mape | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mlp | Bargam | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mlt | Maltese | Afro-Asiatic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 | +| mmo | Mangga Buang | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mmx | Madak | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mna | Mbula | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mni | Manipuri | Sino-Tibetan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | +| mon | Mongolian | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| mop | Mopán Maya | Mayan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mos | Mossi | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| mox | Molima | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mph | Maung | Iwaidjan Proper | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mpj | Martu Wangka | Pama-Nyungan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mpm | Yosondúa Mixtec | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mpp | Migabac | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mps | Dadibi | Teberan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mpt | Mian | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mpx | Misima-Panaeati | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mqb | Mbuko | Afro-Asiatic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mqj | Mamasa | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mri | Maori | Austronesian | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| msa | Malay (macrolanguage) | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | +| msb | Masbatenyo | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| msc | Sankaran Maninka | Mande | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| msk | Mansaka | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| msm | Agusan Manobo | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| msy | Aruamu | Ramu | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mti | Maiwa (Papua New Guinea) | Dagan | 0 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mto | Totontepec Mixe | Mixe-Zoque | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mui | Musi | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| mup | Malvi | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| mux | Bo-Ung | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| muy | Muyang | Afro-Asiatic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mva | Manam | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mvn | Minaveha | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mwc | Are | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mwe | Mwera (Chimwera) | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mwf | Murrinh-Patha | Southern Daly | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mwp | Kala Lagaw Ya | Pama-Nyungan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mwr | Marwari | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mxb 
| Tezoatlán Mixtec | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mxp | Tlahuitoltepec Mixe | Mixe-Zoque | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mxq | Juquila Mixe | Mixe-Zoque | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mxt | Jamiltepec Mixtec | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mya | Burmese | Sino-Tibetan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 | +| myk | Mamara Senoufo | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| myu | Mundurukú | Tupian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| myw | Muyuw | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| myy | Macuna | Tucanoan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mzz | Maiadomu | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nab | Southern Nambikuára | Nambiquaran | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| naf | Nabak | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nak | Nakanai | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nas | Naasioi | South Bougainville | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nbl | South Ndebele | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nbq | Nggem | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nca | Iyo | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nch | Central Huasteca Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ncj | Northern Puebla Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ncl | Michoacán Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ncu | Chumburung | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nde | North Ndebele | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ndg | Ndengereko | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ndj | Ndamba | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nds | Low German | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nep | 
Nepali (macrolanguage) | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| nfa | Dhao | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ngp | Ngulu | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ngu | Guerrero Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nhe | Eastern Huasteca Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nhg | Tetelcingo Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nhi | Zacatlán-Ahuacatlán-Tepetzintla Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nho | Takuu | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nhr | Naro | Khoe-Kwadi | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nhu | Noone | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nhw | Western Huasteca Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nhy | Northern Oaxaca Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nif | Nek | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nii | Nii | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nij | Ngaju | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| nin | Ninzo | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nko | Nkonya | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nld | Dutch | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 30 | 2 | 0 | 0 | 0 | 0 | 4 | 0 | 55 | +| nlg | Gela | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nna | Nyangumarta | Pama-Nyungan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nno | Norwegian Nynorsk | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | +| nnq | Ngindo | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| noa | Woun Meu | Chocoan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nob | Norwegian Bokmål | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 8 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 20 | +| noe | Nimadi | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nop | Numanggang | 
Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nor | Norwegian | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | +| not | Nomatsiguenga | Arawakan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nou | Ewage-Notu | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nov | Novial | Artificial Language | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| npi | Nepali (individual language) | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | +| npl | Southeastern Puebla Nahuatl | Uto-Aztecan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nqo | N'Ko | Artificial Language | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| nsn | Nehan | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nso | Pedi | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| nss | Nali | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ntj | Ngaanyatjarra | Pama-Nyungan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ntp | Northern Tepehuan | Uto-Aztecan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ntu | Natügu | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nus | Nuer | Nilotic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| nuy | Nunggubuyu | Gunwinyguan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nvm | Namiae | Koiarian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nwi | Southwest Tanna | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nya | Nyanja | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| nys | Nyungar | Pama-Nyungan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nyu | Nyungwe | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| obo | Obo Manobo | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| oci | Occitan (post 1500) | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| okv | Orokaiva | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| omw | South Tairora | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ong | Olo | Nuclear 
Torricelli | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ons | Ono | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ood | Tohono O'odham | Uto-Aztecan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| opm | Oksapmin | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ori | Oriya (macrolanguage) | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| orm | Oromo | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| orv | Old Russian | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ory | Odia | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 4 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | +| ote | Mezquital Otomi | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| otm | Eastern Highland Otomi | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| otn | Tenango Otomi | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| otq | Querétaro Otomi | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ots | Estado de México Otomi | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pab | Parecís | Arawakan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pad | Paumarí | Arawan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pag | Pangasinan | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| pah | Tenharim | Tupian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pam | Pampanga | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pan | Panjabi | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | 6 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 18 | +| pao | Northern Paiute | Uto-Aztecan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pap | Papiamento | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| pbt | Southern Pashto | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| pcm | Nigerian Pidgin | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 4 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | +| pes | Iranian Persian | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| pib | Yine | Arawakan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pio | Piapoco | Arawakan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pir | Piratapuyo | Tucanoan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| piu | Pintupi-Luritja | Pama-Nyungan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pjt | Pitjantjatjara | Pama-Nyungan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pls | San Marcos Tlacoyalco Popoloca | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| plt | Plateau Malagasy | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| plu | Palikúr | Arawakan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pma | Paama | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pms | Piemontese | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| poe | San Juan Atzingo Popoloca | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| poh | Poqomchi' | Mayan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| poi | Highland Popoluca | Mixe-Zoque | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pol | Polish | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | 11 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 4 | 0 | 19 | 4 | 0 | 0 | 0 | 0 | 2 | 0 
| 52 | +| pon | Pohnpeian | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| por | Portuguese | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | 9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 3 | 1 | 6 | 3 | 0 | 0 | 0 | 0 | 2 | 0 | 34 | +| poy | Pogolo | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ppo | Folopa | Teberan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| prf | Paranan | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pri | Paicî | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| prs | Dari | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| ptp | Patep | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ptu | Bambam | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pus | Pushto | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| pwg | Gapapaiwa | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qub | Huallaga Huánuco Quechua | Quechuan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| quc | K'iche' | Mayan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 1 | +| quf | Lambayeque Quechua | Quechuan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| quh | South Bolivian Quechua | Quechuan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qul | North Bolivian Quechua | Quechuan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qup | Southern Pastaza Quechua | Quechuan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| quy | Ayacucho Quechua | Quechuan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| quz | Cusco Quechua | Quechuan | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qvc | Cajamarca Quechua | Quechuan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qve | Eastern Apurímac Quechua | Quechuan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qvh | Huamalíes-Dos de Mayo Huánuco Quechua | Quechuan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qvm | Margos-Yarowilca-Lauricocha Quechua | Quechuan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qvn | North Junín Quechua | Quechuan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qvs | San Martín Quechua | Quechuan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qvw 
| Huaylla Wanca Quechua | Quechuan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qvz | Northern Pastaza Quichua | Quechuan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qwh | Huaylas Ancash Quechua | Quechuan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qxh | Panao Huánuco Quechua | Quechuan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qxn | Northern Conchucos Ancash Quechua | Quechuan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qxo | Southern Conchucos Ancash Quechua | Quechuan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| rai | Ramoaaina | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| raj | Rajasthani | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| reg | Kara (Tanzania) | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| rej | Rejang | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| rgu | Ringgou | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| rkb | Rikbaktsa | Nuclear-Macro-Je | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| rmc | Carpathian Romani | Indo-European | 0 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| rmy | Vlax Romani | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| rom | Romany | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| ron | Romanian | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | 6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 22 | +| roo | Rotokas | North Bougainville | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| rop | Kriol | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| row | Dela-Oenale | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| rro | Waima | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ruf | Luguru | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| rug | Roviana | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| run | Rundi | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| rus | Russian | Indo-European | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | 18 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 4 | 2 | 17 | 4 | 0 | 0 | 0 | 0 | 2 | 0 | 64 | +| rwo | Rawa | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sab | 
Buglere | Chibchan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sag | Sango | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| sah | Yakut | Turkic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| san | Sanskrit | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| sat | Santali | Austroasiatic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | +| sbe | Saliba | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sbk | Safwa | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sbs | Subiya | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| scn | Sicilian | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| sco | Scots | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| seh | Sena | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sey | Secoya | Tucanoan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sgb | Mag-antsi Ayta | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sgz | 
Sursurunga | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| shi | Tachelhit | Afro-Asiatic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| shj | Shatt | Dajuic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| shn | Shan | Tai-Kadai | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| shp | Shipibo-Conibo | Pano-Tacanan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sim | Mende (Papua New Guinea) | Sepik | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sin | Sinhala | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | +| sja | Epena | Chocoan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| slk | Slovak | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | +| sll | Salt-Yui | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| slv | Slovenian | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 13 | +| smk | Bolinao | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| smo | Samoan | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 4 | +| sna | Shona | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | +| snc | Sinaugoro | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| snd | Sindhi | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | +| snn | Siona | Tucanoan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| snp | Siane | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| snx | Sam | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sny | Saniyo-Hiyewe | Sepik | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| som | Somali | Afro-Asiatic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 | +| soq | Kanasi | Dagan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sot | Southern Sotho | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| soy | Miyobe | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| spa | Spanish | Indo-European | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | 13 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 3 | 2 | 15 | 4 | 0 | 0 | 0 | 0 | 4 | 0 | 54 | +| spl | Selepet | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| spm | Akukem | Ramu | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| spp | Supyire Senoufo | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sps | Saposa | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| spy | Sabaot | Nilotic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sqi | Albanian | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| srd | Sardinian | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| sri | Siriano | Tucanoan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| srm | Saramaccan | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| srn | Sranan Tongo | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| srp | Serbian | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | +| srq | Sirionó | Tupian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ssd | Siroi | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ssg | Seimat | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ssw | Swati | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | +| ssx | Samberigi | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| stp | Southeastern Tepehuan | Uto-Aztecan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sua | Sulka | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sue | Suena | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sun | Sundanese | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 | +| sus | Susu | Mande | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| suz | Sunwar | Sino-Tibetan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| svk | Slovakian Sign Language | Sign Language | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| swa | Swahili (macrolanguage) | Atlantic-Congo | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 7 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 16 | +| swe | Swedish | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | 9 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 27 | +| swg | Swabian | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| swh | Swahili (individual 
language) | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| swp | Suau | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sxb | Suba | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| szl | Silesian | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| tac | Lowland Tarahumara | Uto-Aztecan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tah | Tahitian | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| taj | Eastern Tamang | Sino-Tibetan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tam | Tamil | Dravidian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | 7 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 21 | +| taq | Tamasheq | Afro-Asiatic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| tat | Tatar | Turkic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| tav | Tatuyo | Tucanoan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| taw | Tai | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tbc | Takia | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
1 | +| tbf | Mandara | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tbg | North Tairora | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tbo | Tawala | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tbz | Ditammari | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tca | Ticuna | Ticuna-Yuri | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tcs | Torres Strait Creole | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tcz | Thado Chin | Sino-Tibetan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tdt | Tetun Dili | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tee | Huehuetla Tepehua | Totonacan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tel | Telugu | Dravidian | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | 7 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 5 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 25 | +| ter | Tereno | Arawakan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tet | Tetum | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tew | Tewa (USA) | Kiowa-Tanoan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tfr | Teribe | Chibchan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tgk | Tajik | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| tgl | Tagalog | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11 | +| tgo | Sudest | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tgp | Tangoa | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tha | Thai | Tai-Kadai | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | 8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 25 | +| tif | Tifal | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tim | Timbe | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tir | Tigrinya | Afro-Asiatic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | +| tiw | Tiwi | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tiy | Tiruray | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tke | Takwane | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tku | Upper Necaxa Totonac | Totonacan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tlf | Telefol | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tmd | Haruai | Piawi | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tna | Tacana | Pano-Tacanan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tnc | Tanimuca-Retuarã | Tucanoan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tnk | Kwamera | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tnn | North Tanna | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tnp | Whitesands | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| toc | Coyutla Totonac | Totonacan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tod | Toma | Mande | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tof | Gizrra | Eastern Trans-Fly | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| toj | Tojolabal | Mayan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ton | Tonga (Tonga Islands) | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| too | Xicotepec De Juárez Totonac | Totonacan | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| top | Papantla Totonac | Totonacan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tos | Highland Totonac | Totonacan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tpa | Taupota | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tpi | Tok Pisin | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| tpt | Tlachichilco Tepehua | Totonacan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tpz | Tinputz | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| trc | Copala Triqui | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tsn | Tswana | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | +| tso | Tsonga | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | +| tsw | Tsishingini | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ttc | Tektiteko | Mayan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tte | Bwanabwana | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tuc 
| Mutu | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tue | Tuyuca | Tucanoan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tuf | Central Tunebo | Chibchan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tuk | Turkmen | Turkic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| tum | Tumbuka | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| tuo | Tucano | Tucanoan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tur | Turkish | Turkic | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | 7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0 | 4 | 2 | 0 | 0 | 0 | 0 | 2 | 0 | 28 | +| tvk | Southeast Ambrym | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| twi | Twi | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| txq | Tii | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| txu | Kayapó | Nuclear-Macro-Je | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tyv | Tuvinian | Turkic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tzj | Tz'utujil | Mayan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tzl | Talossan | 
Artificial Language | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tzm | Central Atlas Tamazight | Afro-Asiatic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| tzo | Tzotzil | Mayan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ubr | Ubir | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ubu | Umbu-Ungu | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| udu | Uduk | Koman | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| uig | Uighur | Turkic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | +| ukr | Ukrainian | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | +| uli | Ulithian | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ulk | Meriam Mir | Eastern Trans-Fly | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| umb | Umbundu | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| upv | Uripiv-Wala-Rano-Atchin | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ura | Urarina | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 1 | +| urb | Urubú-Kaapor | Tupian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| urd | Urdu | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 | 8 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 22 | +| uri | Urim | Nuclear Torricelli | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| urt | Urat | Nuclear Torricelli | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| urw | Sop | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| usa | Usarufa | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| usp | Uspanteco | Mayan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| uvh | Uri | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| uvl | Lote | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| uzb | Uzbek | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| uzn | Northern Uzbek | Turkic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| vec | Venetian | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| ven | Venda | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| vid | Vidunda | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| vie | Vietnamese | Austroasiatic | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | 6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 8 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 25 | +| viv | Iduna | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| vmy | Ayautla Mazatec | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| waj | Waffa | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wal | Wolaytta | Ta-Ne-Omotic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wap | Wapishana | Arawakan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| war | Waray (Philippines) | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| wat | Kaninuwa | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wbi | Vwanji | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wbp | Warlpiri | Pama-Nyungan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wed | Wedau | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wer | Weri | Kunimaipan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wim | Wik-Mungkan | Pama-Nyungan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wiu | Wiru | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wiv | Vitu | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wln | Walloon | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wmt | Walmajarri | Pama-Nyungan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wmw | Mwani | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wnc | Wantoat | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wnu | Usan | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wol | Wolof | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| wos | Hanga Hundi | Ndu | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wrk | Garrwa | Garrwan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wro | Worrorra | Worrorran | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wrs | Waris | Border | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wsk | Waskia | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wuu | Wu Chinese | Sino-Tibetan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wuv | Wuvulu-Aua | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| xav | Xavánte | Nuclear-Macro-Je | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| xbi | Kombio | Nuclear Torricelli | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| xed | Hdi | Afro-Asiatic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| xho | Xhosa | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 3 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| xla | Kamula | Kamula-Elevala | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| xnn | Northern Kankanay | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| xon | Konkomba | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| xsi | Sio | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| xtd | Diuxi-Tilantongo Mixtec | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| xtm | 
Magdalena Peñasco Mixtec | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yaa | Yaminahua | Pano-Tacanan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yad | Yagua | Peba-Yagua | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yal | Yalunka | Mande | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yap | Yapese | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yaq | Yaqui | Uto-Aztecan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yby | Yaweyuha | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ycn | Yucuna | Arawakan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ydd | Eastern Yiddish | Indo-European | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| yid | Yiddish | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yka | Yakan | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yle | Yele | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yml | Iamalele | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 0 | 1 | +| yon | Yongkom | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yor | Yoruba | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 5 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 16 | +| yrb | Yareba | Yareban | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yre | Yaouré | Mande | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yss | Yessan-Mayo | Sepik | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yue | Yue Chinese | Sino-Tibetan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| yuj | Karkar-Yuri | Pauwasi | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yut | Yopno | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yuw | Yau (Morobe Province) | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yva | Yawa | Yawa-Saweru | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zaa | Sierra de Juárez Zapotec | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zab | Western Tlacolula Valley Zapotec | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zac | Ocotlán Zapotec | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zad | Cajonos Zapotec | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zai | Isthmus Zapotec | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zaj | Zaramo | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zam | Miahuatlán Zapotec | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zao | Ozolotepec Zapotec | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zap | Zapotec | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zar | Rincón Zapotec | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zas | Santo Domingo Albarradas Zapotec | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zat | Tabaa Zapotec | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zav | Yatzachi Zapotec | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zaw | Mitla Zapotec | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zca | Coatecas Altas Zapotec | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zga | Kinga | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zho | Chinese | Unclassified | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 26 | +| zia | Zia | Nuclear Trans New Guinea | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ziw | Zigula | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zlm | Malay (individual language) | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zos | Francisco León Zoque | Mixe-Zoque | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zpc | Choapan Zapotec | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zpl | Lachixío Zapotec | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zpm | Mixtepec Zapotec | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zpo | Amatlán Zapotec | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zpq | Zoogocho Zapotec | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zpu | Yalálag Zapotec | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zpv | Chichicapan 
Zapotec | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zpz | Texmelucan Zapotec | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zsm | Standard Malay | Austronesian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| zsr | Southern Rincon Zapotec | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ztq | Quioquitani-Quierí Zapotec | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zty | Yatee Zapotec | Otomanguean | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zul | Zulu | Atlantic-Congo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | +| zyp | Zyphe Chin | Sino-Tibetan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| Total | None | None | None | 0 | 55 | 49 | 16 | 3 | 0 | 3 | 0 | 3 | 1492 | 846 | 316 | 7 | 10 | 22 | 5 | 0 | 3 | 28 | 92 | 56 | 593 | 88 | 2 | 2 | 6 | 7 | 37 | 24 | ->>>>>>> origin/main
diff --git a/docs/usage/results.md b/docs/usage/results.md new file mode 100644 index 0000000000..7ae7bda3cd --- /dev/null +++ b/docs/usage/results.md @@ -0,0 +1,123 @@ +# Loading and working with results + +Multiple models have already been run on tasks available within MTEB. These results are available in the results [repository](https://github.com/embeddings-benchmark/results). + +To make the results more easily accessible, we have designed custom functionality for retrieving from the repository. For instance, if you are selecting the best model for your French and English retrieval task on legal documents you could fetch the relevant tasks and create a dataframe of the results using the following code: + +```python +import mteb + +# select your tasks +tasks = mteb.get_tasks(task_types=["Retrieval"], languages=["eng", "fra"], domains=["Legal"]) +# or use a benchmark +tasks = mteb.get_benchmark("MTEB(Multilingual, v1)").tasks + +model_names = [ + "GritLM/GritLM-7B", + "intfloat/multilingual-e5-large", +] + +results = mteb.load_results(models=model_names, tasks=tasks) +``` + +From this you will get a results object: +```py +results +# BenchmarkResults(model_results=[...](#10)) +type(results) +# mteb.load_results.benchmark_results.BenchmarkResults +``` + +## Working with the result objects + +The result object is a convenient object in `mteb` for working with dataframes and allows you to quickly examine your results. + +![](images/visualizations/result_objects.png) + +The object contains a lot of convenience functions for inspecting and examining the results: +```py +print(results.model_names) +# ['GritLM/GritLM-7B', 'intfloat/multilingual-e5-large'] + +task_names = results.task_names +print(task_names) +# ['SpartQA', 'PlscClusteringP2P.v2', 'StackOverflowQA', 'JSICK', ... 
+``` + +### Filtering Results + +There are also utility functions that allow you to select certain models or tasks: +```py +# select only gritLM +results = results.select_models(["GritLM/GritLM-7B"]) + +# select only retrieval tasks +tasks = mteb.get_tasks(tasks=task_names) +retrieval_tasks = [task for task in tasks if task.metadata.type == "Retrieval"] + +results = results.select_tasks(retrieval_tasks) +``` + +### Creating a Dataframe + +```py +df = results.to_dataframe() + +print(df) +# model_name task_name GritLM/GritLM-7B +# 0 AILAStatutes 0.418000 +# 1 ArguAna 0.631710 +# 2 BelebeleRetrieval 0.717035 +# 3 CovidRetrieval 0.734010 +# 4 HagridRetrieval 0.986730 +# 5 LEMBPasskeyRetrieval 0.382500 +# 6 LegalBenchCorporateLobbying 0.949990 +# 7 MIRACLRetrievalHardNegatives 0.516793 +# 8 MLQARetrieval 0.727420 +# 9 SCIDOCS 0.244090 +# 10 SpartQA 0.093550 +# 11 StackOverflowQA 0.933670 +# 12 StatcanDialogueDatasetRetrieval 0.457587 +# 13 TRECCOVID 0.743130 +# 14 TempReasonL1 0.071640 +# 15 TwitterHjerneRetrieval 0.432660 +# 16 WikipediaRetrievalMultilingual 0.917722 +# 17 WinoGrande 0.536970 +``` + +By default this will give you the results in a `"wide"` format. However, you can just as well get them in a long format: + +```py +long_format_df = results.to_dataframe(format="long") + +print(long_format_df.head(5)) +# model_name task_name score +# 0 GritLM/GritLM-7B AILAStatutes 0.418000 +# 1 GritLM/GritLM-7B ArguAna 0.631710 +# 2 GritLM/GritLM-7B BelebeleRetrieval 0.717035 +# 3 GritLM/GritLM-7B CovidRetrieval 0.734010 +# 4 GritLM/GritLM-7B HagridRetrieval 0.986730 +``` + +### Adding metadata to table + +One might want to add some more metadata to the table. 
This is luckily quite easy using: + +```py +import pandas as pd + +task_df = tasks.to_dataframe(properties=["name", "type", "domains"]) +task_df = task_df.rename(columns={"name": "task_name"}) + +df_with_meta = pd.merge(task_df, df) + +print(df_with_meta.head(5)) +# task_name type domains GritLM/GritLM-7B +# 0 SpartQA Retrieval [Encyclopaedic, Written] 0.093550 +# 1 StackOverflowQA Retrieval [Programming, Written] 0.933670 +# 2 BelebeleRetrieval Retrieval [Web, News, Written] 0.717035 +# 3 ArguAna Retrieval [Medical, Written] 0.631710 +# 4 TempReasonL1 Retrieval [Encyclopaedic, Written] 0.071640 +``` + + diff --git a/docs/usage/usage.md b/docs/usage/usage.md index cba88f21ea..7eccd7d07c 100644 --- a/docs/usage/usage.md +++ b/docs/usage/usage.md @@ -438,48 +438,37 @@ mteb run -t NFCorpus -m all-MiniLM-L6-v2 --output_folder results --save_predicti ### Caching Embeddings To Re-Use Them -There are times you may want to cache the embeddings so you can re-use them. This may be true if you have multiple query sets for the same corpus (e.g. Wikipedia) or are doing some optimization over the queries (e.g. prompting, other experiments). You can setup a cache by using a simple wrapper, which will save the cache per task in the `cache_embeddings/{task_name}` folder: +There are times you may want to cache the embeddings so you can re-use them. This may be true if you have multiple query sets for the same corpus (e.g. Wikipedia) or are doing some optimization over the queries (e.g. prompting, other experiments). You can set up a cache by using a simple wrapper, which will save the cache per task in the `<path_to_cache_dir>/<task_name>` folder: ```python -# define your task and model above as normal +# define your task(s) and model above as normal ... 
# wrap the model with the cache wrapper from mteb.models.cache_wrapper import CachedEmbeddingWrapper -model_with_cached_emb = CachedEmbeddingWrapper(model, cache_path='path_to_cache_dir') +model_with_cached_emb = CachedEmbeddingWrapper(model, cache_path='<path_to_cache_dir>') # run as normal evaluation.run(model, ...) ``` -## Leaderboard - -This section contains information on how to interact with the leaderboard including running it locally, analysing the results, annotating contamination and more. - -### Fetching results from the Leaderboard - -Multiple models have already been run on tasks available within MTEB. These results are available results [repository](https://github.com/embeddings-benchmark/results). - -To make the results more easily accessible, we have designed custom functionality for retrieving from the repository. For instance, if you are selecting the best model for your French and English retrieval task on legal documents you could fetch the relevant tasks and create a dataframe of the results using the following code: +If you want to directly access the cached embeddings (e.g. 
for subsequent analyses) follow this example: ```python -import mteb -from mteb.task_selection import results_to_dataframe +import numpy as np +from mteb.models.cache_wrapper import TextVectorMap -tasks = mteb.get_tasks( - task_types=["Retrieval"], languages=["eng", "fra"], domains=["Legal"] -) +# Access the memory-mapped file and convert to array +vector_map = TextVectorMap("<path_to_cache_dir>/AppsRetrieval") +vector_map.load(name="AppsRetrieval") +vectors = np.asarray(vector_map.vectors) -model_names = [ - "GritLM/GritLM-7B", - "intfloat/multilingual-e5-small", - "intfloat/multilingual-e5-base", - "intfloat/multilingual-e5-large", -] -models = [mteb.get_model_meta(name) for name in model_names] +# Remove all "placeholders" in the embedding cache +zero_mask = (vectors == 0).all(axis=1) +vectors = vectors[~zero_mask] +``` -results = mteb.load_results(models=models, tasks=tasks) +## Leaderboard -df = results_to_dataframe(results) -``` +This section contains information on how to interact with the leaderboard including running it locally, analysing the results, annotating contamination and more. ### Annotate Contamination @@ -503,8 +492,8 @@ It is possible to completely deploy the leaderboard locally or self-host it. Thi integrate build their own benchmarks or integrate custom tasks into existing benchmarks. Running the leaderboard is quite easy. Simply run: -```py -python -m mteb.leaderboard.app +```bash +make run-leaderboard ``` -The leaderboard requires gradio install, which can be installed using `pip install mteb[gradio]` and requires python >3.10. +The leaderboard requires gradio to be installed, which can be done using `pip install mteb[leaderboard]`, and requires python >3.10. 
diff --git a/mteb/abstasks/AbsTask.py b/mteb/abstasks/AbsTask.py index b81c860b83..93e990c76b 100644 --- a/mteb/abstasks/AbsTask.py +++ b/mteb/abstasks/AbsTask.py @@ -347,6 +347,13 @@ def filter_languages( self.hf_subsets = subsets_to_keep return self + @property + def is_aggregate( + self, + ) -> bool: # Overrided by subclasses (AbsTaskAggregate) that are aggregate + """Whether the task is aggregate. Subclasses that are aggregate should override this with `True`.""" + return False + @property def eval_splits(self) -> list[str]: if self._eval_splits: diff --git a/mteb/abstasks/AbsTaskSpeedTask.py b/mteb/abstasks/AbsTaskSpeedTask.py index 0d3f8ae8dd..ff095d8332 100644 --- a/mteb/abstasks/AbsTaskSpeedTask.py +++ b/mteb/abstasks/AbsTaskSpeedTask.py @@ -3,6 +3,7 @@ import logging import platform import time +import warnings from pathlib import Path import numpy as np @@ -85,6 +86,10 @@ def get_system_info(self) -> dict[str, str]: return info def _evaluate_subset(self, model: Encoder, data_split, **kwargs) -> ScoresDict: + warnings.warn( + "SpeedTask is deprecated and will be removed in `v2`.", + DeprecationWarning, + ) model.encode( ["encode this"], device=self.device, task_name=self.metadata.name ) # ensure model is loaded diff --git a/mteb/abstasks/TaskMetadata.py b/mteb/abstasks/TaskMetadata.py index b0df961f21..70cd17a829 100644 --- a/mteb/abstasks/TaskMetadata.py +++ b/mteb/abstasks/TaskMetadata.py @@ -16,10 +16,7 @@ from ..encoder_interface import PromptType from ..languages import ( ISO_LANGUAGE_SCRIPT, - ISO_TO_LANGUAGE, - ISO_TO_SCRIPT, - path_to_lang_codes, - path_to_lang_scripts, + check_language_code, ) TASK_SUBTYPE = Literal[ @@ -100,10 +97,9 @@ "Programming", "Chemistry", "Financial", - "Chemistry", - "Financial", "Music", "Speech", + "Entertainment", ] SAMPLE_CREATION_METHOD = Literal[ @@ -205,23 +201,6 @@ list[ISO_LANGUAGE_SCRIPT], Mapping[HFSubset, list[ISO_LANGUAGE_SCRIPT]] ] -PROGRAMMING_LANGS = [ - "python", - "javascript", - "typescript", - "go", - 
"ruby", - "java", - "php", - "c", - "c++", - "rust", - "swift", - "scala", - "shell", - "sql", -] - METRIC_NAME = str METRIC_VALUE = Union[int, float, dict[str, Any]] @@ -365,30 +344,10 @@ def eval_langs_are_valid(self, eval_langs: LANGUAGES) -> None: if isinstance(eval_langs, dict): for langs in eval_langs.values(): for code in langs: - self._check_language_code(code) + check_language_code(code) else: for code in eval_langs: - self._check_language_code(code) - - @staticmethod - def _check_language_code(code): - """This method checks that the language code (e.g. "eng-Latn") is valid.""" - lang, script = code.split("-") - if script == "Code": - if lang in PROGRAMMING_LANGS: - return # override for code - else: - raise ValueError( - f"Programming language {lang} is not a valid programming language." - ) - if lang not in ISO_TO_LANGUAGE: - raise ValueError( - f"Invalid language code: {lang}, you can find valid ISO 639-3 codes in {path_to_lang_codes}" - ) - if script not in ISO_TO_SCRIPT: - raise ValueError( - f"Invalid script code: {script}, you can find valid ISO 15924 codes in {path_to_lang_scripts}" - ) + check_language_code(code) @property def bcp47_codes(self) -> list[ISO_LANGUAGE_SCRIPT]: diff --git a/mteb/abstasks/aggregated_task.py b/mteb/abstasks/aggregated_task.py index 4c79db01ae..f43c19f237 100644 --- a/mteb/abstasks/aggregated_task.py +++ b/mteb/abstasks/aggregated_task.py @@ -154,6 +154,10 @@ def _calculate_metrics_from_split( "Aggregate tasks does not implement a _calculate_metrics_from_split. Instead use the individual tasks." 
) + @property + def is_aggregate(self): # Overrides the is_aggregate method on AbsTask + return True + @property def eval_splits(self) -> list[str]: if self._eval_splits: diff --git a/mteb/benchmarks/benchmark.py b/mteb/benchmarks/benchmark.py index e8d61d4e0b..37b654ac92 100644 --- a/mteb/benchmarks/benchmark.py +++ b/mteb/benchmarks/benchmark.py @@ -48,6 +48,8 @@ class Benchmark: citation: str | None = None contacts: list[str] | None = None display_on_leaderboard: bool = True + icon: str | None = None + display_name: str | None = None def __iter__(self): return iter(self.tasks) diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py index 694babfc18..2438ade45a 100644 --- a/mteb/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -16,18 +16,20 @@ ] # Allows the type to be a string, but ensures that the string is a URL -MMTEB_CITATION = """@article{enevoldsen2025mmtebmassivemultilingualtext, - title={MMTEB: Massive Multilingual Text Embedding Benchmark}, - author={Kenneth Enevoldsen and Isaac Chung and Imene Kerboua and Márton Kardos and Ashwin Mathur and David Stap and Jay Gala and Wissam Siblini and Dominik Krzemiński and Genta Indra Winata and Saba Sturua and Saiteja Utpala and Mathieu Ciancone and Marion Schaeffer and Gabriel Sequeira and Diganta Misra and Shreeya Dhakal and Jonathan Rystrøm and Roman Solomatin and Ömer Çağatan and Akash Kundu and Martin Bernstorff and Shitao Xiao and Akshita Sukhlecha and Bhavish Pahwa and Rafał Poświata and Kranthi Kiran GV and Shawon Ashraf and Daniel Auras and Björn Plüster and Jan Philipp Harries and Loïc Magne and Isabelle Mohr and Mariya Hendriksen and Dawei Zhu and Hippolyte Gisserot-Boukhlef and Tom Aarsen and Jan Kostkan and Konrad Wojtasik and Taemin Lee and Marek Šuppa and Crystina Zhang and Roberta Rocca and Mohammed Hamdy and Andrianos Michail and John Yang and Manuel Faysse and Aleksei Vatolin and Nandan Thakur and Manan Dey and Dipam Vasani and Pranjal Chitale and Simone Tedeschi 
and Nguyen Tai and Artem Snegirev and Michael Günther and Mengzhou Xia and Weijia Shi and Xing Han Lù and Jordan Clive and Gayatri Krishnakumar and Anna Maksimova and Silvan Wehrli and Maria Tikhonova and Henil Panchal and Aleksandr Abramov and Malte Ostendorff and Zheng Liu and Simon Clematide and Lester James Miranda and Alena Fenogenova and Guangyu Song and Ruqiya Bin Safi and Wen-Ding Li and Alessia Borghini and Federico Cassano and Hongjin Su and Jimmy Lin and Howard Yen and Lasse Hansen and Sara Hooker and Chenghao Xiao and Vaibhav Adlakha and Orion Weller and Siva Reddy and Niklas Muennighoff}, - publisher = {arXiv}, - journal={arXiv preprint arXiv:2502.13595}, - year={2025}, - url={https://arxiv.org/abs/2502.13595}, - doi = {10.48550/arXiv.2502.13595}, +MMTEB_CITATION = r"""@article{enevoldsen2025mmtebmassivemultilingualtext, + author = {Kenneth Enevoldsen and Isaac Chung and Imene Kerboua and Márton Kardos and Ashwin Mathur and David Stap and Jay Gala and Wissam Siblini and Dominik Krzemiński and Genta Indra Winata and Saba Sturua and Saiteja Utpala and Mathieu Ciancone and Marion Schaeffer and Gabriel Sequeira and Diganta Misra and Shreeya Dhakal and Jonathan Rystrøm and Roman Solomatin and Ömer Çağatan and Akash Kundu and Martin Bernstorff and Shitao Xiao and Akshita Sukhlecha and Bhavish Pahwa and Rafał Poświata and Kranthi Kiran GV and Shawon Ashraf and Daniel Auras and Björn Plüster and Jan Philipp Harries and Loïc Magne and Isabelle Mohr and Mariya Hendriksen and Dawei Zhu and Hippolyte Gisserot-Boukhlef and Tom Aarsen and Jan Kostkan and Konrad Wojtasik and Taemin Lee and Marek Šuppa and Crystina Zhang and Roberta Rocca and Mohammed Hamdy and Andrianos Michail and John Yang and Manuel Faysse and Aleksei Vatolin and Nandan Thakur and Manan Dey and Dipam Vasani and Pranjal Chitale and Simone Tedeschi and Nguyen Tai and Artem Snegirev and Michael Günther and Mengzhou Xia and Weijia Shi and Xing Han Lù and Jordan Clive and Gayatri Krishnakumar and Anna 
Maksimova and Silvan Wehrli and Maria Tikhonova and Henil Panchal and Aleksandr Abramov and Malte Ostendorff and Zheng Liu and Simon Clematide and Lester James Miranda and Alena Fenogenova and Guangyu Song and Ruqiya Bin Safi and Wen-Ding Li and Alessia Borghini and Federico Cassano and Hongjin Su and Jimmy Lin and Howard Yen and Lasse Hansen and Sara Hooker and Chenghao Xiao and Vaibhav Adlakha and Orion Weller and Siva Reddy and Niklas Muennighoff}, + doi = {10.48550/arXiv.2502.13595}, + journal = {arXiv preprint arXiv:2502.13595}, + publisher = {arXiv}, + title = {MMTEB: Massive Multilingual Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2502.13595}, + year = {2025}, }""" MTEB_EN = Benchmark( name="MTEB(eng, v2)", + display_name="English", + icon="https://github.com/lipis/flag-icons/raw/refs/heads/main/flags/4x3/us.svg", tasks=MTEBTasks( get_tasks( tasks=[ @@ -97,6 +99,8 @@ MTEB_ENG_CLASSIC = Benchmark( name="MTEB(eng, v1)", + display_name="English Legacy", + icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/gb.svg", tasks=MTEBTasks( get_tasks( tasks=[ @@ -175,20 +179,24 @@ This page is an adaptation of the [old MTEB leaderboard](https://huggingface.co/spaces/mteb/leaderboard_legacy). We recommend that you use [MTEB(eng, v2)](http://mteb-leaderboard.hf.space/?benchmark_name=MTEB%28eng%2C+v2%29) instead, as it uses updated versions of the task, making it notably faster to run and resolving [a known bug](https://github.com/embeddings-benchmark/mteb/issues/1156) in existing tasks. This benchmark also removes datasets common for fine-tuning, such as MSMARCO, which makes model performance scores more comparable. However, generally, both benchmarks provide similar estimates. 
""", - citation="""@article{muennighoff2022mteb, - author = {Muennighoff, Niklas and Tazi, Nouamane and Magne, Lo{\"\i}c and Reimers, Nils}, - title = {MTEB: Massive Text Embedding Benchmark}, - publisher = {arXiv}, - journal={arXiv preprint arXiv:2210.07316}, - year = {2022} - url = {https://arxiv.org/abs/2210.07316}, - doi = {10.48550/ARXIV.2210.07316}, -}""", + citation=r""" +@article{muennighoff2022mteb, + author = {Muennighoff, Niklas and Tazi, Nouamane and Magne, Lo{\"\i}c and Reimers, Nils}, + doi = {10.48550/ARXIV.2210.07316}, + journal = {arXiv preprint arXiv:2210.07316}, + publisher = {arXiv}, + title = {MTEB: Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2210.07316}, + year = {2022}, +} +""", contacts=["Muennighoff"], ) MTEB_MAIN_RU = Benchmark( name="MTEB(rus, v1)", + display_name="Russian", + icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/ru.svg", tasks=get_tasks( languages=["rus"], tasks=[ @@ -220,25 +228,31 @@ "RuBQRetrieval", # STS "RUParaPhraserSTS", - "RuSTSBenchmarkSTS", "STS22", ], + ) + + get_tasks( + tasks=["RuSTSBenchmarkSTS"], + eval_splits=["test"], ), description="A Russian version of the Massive Text Embedding Benchmark with a number of novel Russian tasks in all task categories of the original MTEB.", reference="https://aclanthology.org/2023.eacl-main.148/", - citation="""@misc{snegirev2024russianfocusedembeddersexplorationrumteb, - title={The Russian-focused embedders' exploration: ruMTEB benchmark and Russian embedding model design}, - author={Artem Snegirev and Maria Tikhonova and Anna Maksimova and Alena Fenogenova and Alexander Abramov}, - year={2024}, - eprint={2408.12503}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2408.12503}, -}""", + citation=r""" +@misc{snegirev2024russianfocusedembeddersexplorationrumteb, + archiveprefix = {arXiv}, + author = {Artem Snegirev and Maria Tikhonova and Anna Maksimova and Alena Fenogenova and 
Alexander Abramov}, + eprint = {2408.12503}, + primaryclass = {cs.CL}, + title = {The Russian-focused embedders' exploration: ruMTEB benchmark and Russian embedding model design}, + url = {https://arxiv.org/abs/2408.12503}, + year = {2024}, +} +""", ) MTEB_RETRIEVAL_WITH_INSTRUCTIONS = Benchmark( name="FollowIR", + display_name="Instruction Following", tasks=get_tasks( tasks=[ "Robust04InstructionRetrieval", @@ -248,18 +262,22 @@ ), description="Retrieval w/Instructions is the task of finding relevant documents for a query that has detailed instructions.", reference="https://arxiv.org/abs/2403.15246", - citation="""@misc{weller2024followir, - title={FollowIR: Evaluating and Teaching Information Retrieval Models to Follow Instructions}, - author={Orion Weller and Benjamin Chang and Sean MacAvaney and Kyle Lo and Arman Cohan and Benjamin Van Durme and Dawn Lawrie and Luca Soldaini}, - year={2024}, - eprint={2403.15246}, - archivePrefix={arXiv}, - primaryClass={cs.IR} -}""", + citation=r""" +@misc{weller2024followir, + archiveprefix = {arXiv}, + author = {Orion Weller and Benjamin Chang and Sean MacAvaney and Kyle Lo and Arman Cohan and Benjamin Van Durme and Dawn Lawrie and Luca Soldaini}, + eprint = {2403.15246}, + primaryclass = {cs.IR}, + title = {FollowIR: Evaluating and Teaching Information Retrieval Models to Follow Instructions}, + year = {2024}, +} +""", ) MTEB_RETRIEVAL_LAW = Benchmark( name="MTEB(Law, v1)", # This benchmark is likely in the need of an update + display_name="Legal", + icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-map-library.svg", tasks=get_tasks( tasks=[ "AILACasedocs", @@ -279,6 +297,8 @@ MTEB_RETRIEVAL_MEDICAL = Benchmark( name="MTEB(Medical, v1)", + display_name="Medical", + icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-map-hospital.svg", tasks=get_tasks( tasks=[ "CUREv1", @@ -318,16 +338,20 @@ including 
bitext mining and classification via retrieval-augmented contexts. """, reference="https://arxiv.org/pdf/2406.07424", - citation="""@article{winata2024miners, - title={MINERS: Multilingual Language Models as Semantic Retrievers}, - author={Winata, Genta Indra and Zhang, Ruochen and Adelani, David Ifeoluwa}, - journal={arXiv preprint arXiv:2406.07424}, - year={2024} -}""", + citation=r""" +@article{winata2024miners, + author = {Winata, Genta Indra and Zhang, Ruochen and Adelani, David Ifeoluwa}, + journal = {arXiv preprint arXiv:2406.07424}, + title = {MINERS: Multilingual Language Models as Semantic Retrievers}, + year = {2024}, +} +""", ) SEB = Benchmark( name="MTEB(Scandinavian, v1)", + display_name="Scandinavian", + icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/dk.svg", tasks=get_tasks( tasks=[ # Bitext @@ -367,18 +391,21 @@ ), description="A curated selection of tasks coverering the Scandinavian languages; Danish, Swedish and Norwegian, including Bokmål and Nynorsk.", reference="https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/", - citation="""@inproceedings{enevoldsen2024scandinavian, - title={The Scandinavian Embedding Benchmarks: Comprehensive Assessment of Multilingual and Monolingual Text Embedding}, - author={Enevoldsen, Kenneth and Kardos, M{\'a}rton and Muennighoff, Niklas and Nielbo, Kristoffer}, - booktitle={Advances in Neural Information Processing Systems}, - year={2024}, - url={https://nips.cc/virtual/2024/poster/97869} -}""", + citation=r""" +@inproceedings{enevoldsen2024scandinavian, + author = {Enevoldsen, Kenneth and Kardos, M{\'a}rton and Muennighoff, Niklas and Nielbo, Kristoffer}, + booktitle = {Advances in Neural Information Processing Systems}, + title = {The Scandinavian Embedding Benchmarks: Comprehensive Assessment of Multilingual and Monolingual Text Embedding}, + url = {https://nips.cc/virtual/2024/poster/97869}, + year = {2024}, +} +""", 
contacts=["KennethEnevoldsen", "x-tabdeveloping", "Samoed"], ) CoIR = Benchmark( name="CoIR", + display_name="Code Information Retrieval", tasks=get_tasks( tasks=[ "AppsRetrieval", @@ -395,19 +422,22 @@ ), description="CoIR: A Comprehensive Benchmark for Code Information Retrieval Models", reference="https://github.com/CoIR-team/coir", - citation="""@misc{li2024coircomprehensivebenchmarkcode, - title={CoIR: A Comprehensive Benchmark for Code Information Retrieval Models}, - author={Xiangyang Li and Kuicai Dong and Yi Quan Lee and Wei Xia and Yichun Yin and Hao Zhang and Yong Liu and Yasheng Wang and Ruiming Tang}, - year={2024}, - eprint={2407.02883}, - archivePrefix={arXiv}, - primaryClass={cs.IR}, - url={https://arxiv.org/abs/2407.02883}, -}""", + citation=r""" +@misc{li2024coircomprehensivebenchmarkcode, + archiveprefix = {arXiv}, + author = {Xiangyang Li and Kuicai Dong and Yi Quan Lee and Wei Xia and Yichun Yin and Hao Zhang and Yong Liu and Yasheng Wang and Ruiming Tang}, + eprint = {2407.02883}, + primaryclass = {cs.IR}, + title = {CoIR: A Comprehensive Benchmark for Code Information Retrieval Models}, + url = {https://arxiv.org/abs/2407.02883}, + year = {2024}, +} +""", ) RAR_b = Benchmark( name="RAR-b", + display_name="Reasoning retrieval", tasks=get_tasks( tasks=[ "ARCChallenge", @@ -431,17 +461,21 @@ ), description="A benchmark to evaluate reasoning capabilities of retrievers.", reference="https://arxiv.org/abs/2404.06347", - citation="""@article{xiao2024rar, - title={RAR-b: Reasoning as Retrieval Benchmark}, - author={Xiao, Chenghao and Hudson, G Thomas and Al Moubayed, Noura}, - journal={arXiv preprint arXiv:2404.06347}, - year={2024} -}""", + citation=r""" +@article{xiao2024rar, + author = {Xiao, Chenghao and Hudson, G Thomas and Al Moubayed, Noura}, + journal = {arXiv preprint arXiv:2404.06347}, + title = {RAR-b: Reasoning as Retrieval Benchmark}, + year = {2024}, +} +""", contacts=["gowitheflow-1998"], ) MTEB_FRA = Benchmark( name="MTEB(fra, v1)", + 
display_name="French", + icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/fr.svg", tasks=MTEBTasks( get_tasks( languages=["fra"], @@ -482,20 +516,24 @@ ), description="MTEB-French, a French expansion of the original benchmark with high-quality native French datasets.", reference="https://arxiv.org/abs/2405.20468", - citation="""@misc{ciancone2024mtebfrenchresourcesfrenchsentence, - title={MTEB-French: Resources for French Sentence Embedding Evaluation and Analysis}, - author={Mathieu Ciancone and Imene Kerboua and Marion Schaeffer and Wissam Siblini}, - year={2024}, - eprint={2405.20468}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2405.20468}, -}""", + citation=r""" +@misc{ciancone2024mtebfrenchresourcesfrenchsentence, + archiveprefix = {arXiv}, + author = {Mathieu Ciancone and Imene Kerboua and Marion Schaeffer and Wissam Siblini}, + eprint = {2405.20468}, + primaryclass = {cs.CL}, + title = {MTEB-French: Resources for French Sentence Embedding Evaluation and Analysis}, + url = {https://arxiv.org/abs/2405.20468}, + year = {2024}, +} +""", contacts=["imenelydiaker"], ) MTEB_DEU = Benchmark( name="MTEB(deu, v1)", + display_name="German", + icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/de.svg", tasks=get_tasks( languages=["deu"], exclusive_language_filter=True, @@ -529,20 +567,24 @@ ), description="A benchmark for text-embedding performance in German.", reference="https://arxiv.org/html/2401.02709v1", - citation="""@misc{wehrli2024germantextembeddingclustering, - title={German Text Embedding Clustering Benchmark}, - author={Silvan Wehrli and Bert Arnrich and Christopher Irrgang}, - year={2024}, - eprint={2401.02709}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2401.02709}, -}""", + citation=r""" +@misc{wehrli2024germantextembeddingclustering, + archiveprefix = {arXiv}, + author = {Silvan Wehrli and 
Bert Arnrich and Christopher Irrgang}, + eprint = {2401.02709}, + primaryclass = {cs.CL}, + title = {German Text Embedding Clustering Benchmark}, + url = {https://arxiv.org/abs/2401.02709}, + year = {2024}, +} +""", contacts=["slvnwhrl"], ) MTEB_KOR = Benchmark( name="MTEB(kor, v1)", + display_name="Korean", + icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/kr.svg", tasks=get_tasks( languages=["kor"], tasks=[ # @KennethEnevoldsen: We could probably expand this to a more solid benchamrk, but for now I have left it as is. @@ -565,6 +607,8 @@ MTEB_POL = Benchmark( name="MTEB(pol, v1)", + display_name="Polish", + icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/pl.svg", tasks=MTEBTasks( get_tasks( languages=["pol"], @@ -599,17 +643,21 @@ consisting of titles and abstracts of scientific publications in Polish, which was used as the basis for two novel clustering tasks.""", # Rephrased from the abstract reference="https://arxiv.org/abs/2405.10138", - citation="""@article{poswiata2024plmteb, - title={PL-MTEB: Polish Massive Text Embedding Benchmark}, - author={Rafał Poświata and Sławomir Dadas and Michał Perełkiewicz}, - journal={arXiv preprint arXiv:2405.10138}, - year={2024} -}""", + citation=r""" +@article{poswiata2024plmteb, + author = {Rafał Poświata and Sławomir Dadas and Michał Perełkiewicz}, + journal = {arXiv preprint arXiv:2405.10138}, + title = {PL-MTEB: Polish Massive Text Embedding Benchmark}, + year = {2024}, +} +""", contacts=["rafalposwiata"], ) MTEB_code = Benchmark( name="MTEB(Code, v1)", + display_name="Code", + icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-tech-electronics.svg", tasks=get_tasks( tasks=[ # Retrieval @@ -647,152 +695,171 @@ citation=MMTEB_CITATION, ) +mteb_multilingual_tasks = get_tasks( + tasks=[ + "BornholmBitextMining", + "BibleNLPBitextMining", + "BUCC.v2", + 
"DiaBlaBitextMining", + "FloresBitextMining", + "IN22GenBitextMining", + "IndicGenBenchFloresBitextMining", + "NollySentiBitextMining", + "NorwegianCourtsBitextMining", + "NTREXBitextMining", + "NusaTranslationBitextMining", + "NusaXBitextMining", + "Tatoeba", + "BulgarianStoreReviewSentimentClassfication", + "CzechProductReviewSentimentClassification", + "GreekLegalCodeClassification", + "DBpediaClassification", + "FinancialPhrasebankClassification", + "PoemSentimentClassification", + "ToxicConversationsClassification", + "TweetTopicSingleClassification", + "EstonianValenceClassification", + "FilipinoShopeeReviewsClassification", + "GujaratiNewsClassification", + "SentimentAnalysisHindi", + "IndonesianIdClickbaitClassification", + "ItaCaseholdClassification", + "KorSarcasmClassification", + "KurdishSentimentClassification", + "MacedonianTweetSentimentClassification", + "AfriSentiClassification", + "AmazonCounterfactualClassification", + "CataloniaTweetClassification", + "CyrillicTurkicLangClassification", + "IndicLangClassification", + "MasakhaNEWSClassification", + "MassiveIntentClassification", + "MultiHateClassification", + "NordicLangClassification", + "NusaParagraphEmotionClassification", + "NusaX-senti", + "ScalaClassification", + "SwissJudgementClassification", + "NepaliNewsClassification", + "OdiaNewsClassification", + "PunjabiNewsClassification", + "PolEmo2.0-OUT", + "PAC", + "SinhalaNewsClassification", + "CSFDSKMovieReviewSentimentClassification", + "SiswatiNewsClassification", + "SlovakMovieReviewSentimentClassification", + "SwahiliNewsClassification", + "DalajClassification", + "TswanaNewsClassification", + "IsiZuluNewsClassification", + "WikiCitiesClustering", + "MasakhaNEWSClusteringS2S", + "RomaniBibleClustering", + "ArXivHierarchicalClusteringP2P", + "ArXivHierarchicalClusteringS2S", + "BigPatentClustering.v2", + "BiorxivClusteringP2P.v2", + "MedrxivClusteringP2P.v2", + "StackExchangeClustering.v2", + "AlloProfClusteringS2S.v2", + 
"HALClusteringS2S.v2", + "SIB200ClusteringS2S", + "WikiClusteringP2P.v2", + "PlscClusteringP2P.v2", + "SwednClusteringP2P", + "CLSClusteringP2P.v2", + "StackOverflowQA", + "TwitterHjerneRetrieval", + "AILAStatutes", + "ArguAna", + "HagridRetrieval", + "LegalBenchCorporateLobbying", + "LEMBPasskeyRetrieval", + "SCIDOCS", + "SpartQA", + "TempReasonL1", + "TRECCOVID", + "WinoGrande", + "BelebeleRetrieval", + "MLQARetrieval", + "StatcanDialogueDatasetRetrieval", + "WikipediaRetrievalMultilingual", + "CovidRetrieval", + "Core17InstructionRetrieval", + "News21InstructionRetrieval", + "Robust04InstructionRetrieval", + "KorHateSpeechMLClassification", + "MalteseNewsClassification", + "MultiEURLEXMultilabelClassification", + "BrazilianToxicTweetsClassification", + "CEDRClassification", + "CTKFactsNLI", + "SprintDuplicateQuestions", + "TwitterURLCorpus", + "ArmenianParaphrasePC", + "indonli", + "OpusparcusPC", + "PawsXPairClassification", + "RTE3", + "XNLI", + "PpcPC", + "TERRa", + "WebLINXCandidatesReranking", + "AlloprofReranking", + "VoyageMMarcoReranking", + "WikipediaRerankingMultilingual", + "RuBQReranking", + "T2Reranking", + "GermanSTSBenchmark", + "SICK-R", + "STS12", + "STS13", + "STS14", + "STS15", + "STSBenchmark", + "FaroeseSTS", + "FinParaSTS", + "JSICK", + "IndicCrosslingualSTS", + "SemRel24STS", + "STS17", + "STS22.v2", + "STSES", + "STSB", + "MIRACLRetrievalHardNegatives", + ], +) + MTEB_multilingual = Benchmark( name="MTEB(Multilingual, v1)", - tasks=get_tasks( - tasks=[ - "BornholmBitextMining", - "BibleNLPBitextMining", - "BUCC.v2", - "DiaBlaBitextMining", - "FloresBitextMining", - "IN22GenBitextMining", - "IndicGenBenchFloresBitextMining", - "NollySentiBitextMining", - "NorwegianCourtsBitextMining", - "NTREXBitextMining", - "NusaTranslationBitextMining", - "NusaXBitextMining", - "Tatoeba", - "BulgarianStoreReviewSentimentClassfication", - "CzechProductReviewSentimentClassification", - "GreekLegalCodeClassification", - "DBpediaClassification", - 
"FinancialPhrasebankClassification", - "PoemSentimentClassification", - "ToxicConversationsClassification", - "TweetTopicSingleClassification", - "EstonianValenceClassification", - "FilipinoShopeeReviewsClassification", - "GujaratiNewsClassification", - "SentimentAnalysisHindi", - "IndonesianIdClickbaitClassification", - "ItaCaseholdClassification", - "KorSarcasmClassification", - "KurdishSentimentClassification", - "MacedonianTweetSentimentClassification", - "AfriSentiClassification", - "AmazonCounterfactualClassification", - "CataloniaTweetClassification", - "CyrillicTurkicLangClassification", - "IndicLangClassification", - "MasakhaNEWSClassification", - "MassiveIntentClassification", - "MultiHateClassification", - "NordicLangClassification", - "NusaParagraphEmotionClassification", - "NusaX-senti", - "ScalaClassification", - "SwissJudgementClassification", - "NepaliNewsClassification", - "OdiaNewsClassification", - "PunjabiNewsClassification", - "PolEmo2.0-OUT", - "PAC", - "SinhalaNewsClassification", - "CSFDSKMovieReviewSentimentClassification", - "SiswatiNewsClassification", - "SlovakMovieReviewSentimentClassification", - "SwahiliNewsClassification", - "DalajClassification", - "TswanaNewsClassification", - "IsiZuluNewsClassification", - "WikiCitiesClustering", - "MasakhaNEWSClusteringS2S", - "RomaniBibleClustering", - "ArXivHierarchicalClusteringP2P", - "ArXivHierarchicalClusteringS2S", - "BigPatentClustering.v2", - "BiorxivClusteringP2P.v2", - "MedrxivClusteringP2P.v2", - "StackExchangeClustering.v2", - "AlloProfClusteringS2S.v2", - "HALClusteringS2S.v2", - "SIB200ClusteringS2S", - "WikiClusteringP2P.v2", - "SNLHierarchicalClusteringP2P", - "PlscClusteringP2P.v2", - "SwednClusteringP2P", - "CLSClusteringP2P.v2", - "StackOverflowQA", - "TwitterHjerneRetrieval", - "AILAStatutes", - "ArguAna", - "HagridRetrieval", - "LegalBenchCorporateLobbying", - "LEMBPasskeyRetrieval", - "SCIDOCS", - "SpartQA", - "TempReasonL1", - "TRECCOVID", - "WinoGrande", - 
"BelebeleRetrieval", - "MLQARetrieval", - "StatcanDialogueDatasetRetrieval", - "WikipediaRetrievalMultilingual", - "CovidRetrieval", - "Core17InstructionRetrieval", - "News21InstructionRetrieval", - "Robust04InstructionRetrieval", - "KorHateSpeechMLClassification", - "MalteseNewsClassification", - "MultiEURLEXMultilabelClassification", - "BrazilianToxicTweetsClassification", - "CEDRClassification", - "CTKFactsNLI", - "SprintDuplicateQuestions", - "TwitterURLCorpus", - "ArmenianParaphrasePC", - "indonli", - "OpusparcusPC", - "PawsXPairClassification", - "RTE3", - "XNLI", - "PpcPC", - "TERRa", - "WebLINXCandidatesReranking", - "AlloprofReranking", - "VoyageMMarcoReranking", - "WikipediaRerankingMultilingual", - "RuBQReranking", - "T2Reranking", - "GermanSTSBenchmark", - "SICK-R", - "STS12", - "STS13", - "STS14", - "STS15", - "STSBenchmark", - "FaroeseSTS", - "FinParaSTS", - "JSICK", - "IndicCrosslingualSTS", - "SemRel24STS", - "STS17", - "STS22.v2", - "STSES", - "STSB", - "MIRACLRetrievalHardNegatives", - ], + display_name="Multilingual", + icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-gui-globe.svg", + tasks=MTEBTasks( + mteb_multilingual_tasks + get_tasks(tasks=["SNLHierarchicalClusteringP2P"]) ), - description="A large-scale multilingual expansion of MTEB, driven mainly by highly-curated community contributions covering 250+ languages.", - reference=None, + description="A large-scale multilingual expansion of MTEB, driven mainly by highly-curated community contributions covering 250+ languages. 
This benchmark has been replaced by MTEB(Multilingual, v2) as one of the datasets (SNLHierarchicalClustering) included in v1 was removed from the Hugging Face Hub.", + reference="https://arxiv.org/abs/2502.13595", + citation=MMTEB_CITATION, + contacts=["KennethEnevoldsen", "isaac-chung"], +) + + +MTEB_multilingual_v2 = Benchmark( + name="MTEB(Multilingual, v2)", + display_name="Multilingual", + icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-gui-globe.svg", + tasks=mteb_multilingual_tasks, + description="A large-scale multilingual expansion of MTEB, driven mainly by highly-curated community contributions covering 250+ languages. ", + reference="https://arxiv.org/abs/2502.13595", citation=MMTEB_CITATION, contacts=["KennethEnevoldsen", "isaac-chung"], ) MTEB_JPN = Benchmark( name="MTEB(jpn, v1)", + display_name="Japanese", + icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/jp.svg", tasks=get_tasks( languages=["jpn"], tasks=[ @@ -861,6 +928,8 @@ MTEB_INDIC = Benchmark( name="MTEB(Indic, v1)", + display_name="Indic", + icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/in.svg", tasks=MTEBTasks( get_tasks( tasks=[ @@ -901,7 +970,7 @@ (get_task("IndicCrosslingualSTS"),) ), description="A regional geopolitical text embedding benchmark targetting embedding performance on Indic languages.", - reference=None, + reference="https://arxiv.org/abs/2502.13595", citation=MMTEB_CITATION, contacts=["KennethEnevoldsen", "isaac-chung"], ) @@ -952,6 +1021,8 @@ MTEB_EU = Benchmark( name="MTEB(Europe, v1)", + display_name="European", + icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/eu.svg", tasks=get_tasks( tasks=[ "BornholmBitextMining", @@ -1033,13 +1104,14 @@ exclusive_language_filter=True, ), description="A regional geopolitical text embedding benchmark targetting embedding 
performance on European languages.", - reference=None, + reference="https://arxiv.org/abs/2502.13595", citation=MMTEB_CITATION, contacts=["KennethEnevoldsen", "isaac-chung"], ) LONG_EMBED = Benchmark( name="LongEmbed", + display_name="Long-context Retrieval", tasks=get_tasks( tasks=[ "LEMBNarrativeQARetrieval", @@ -1055,12 +1127,14 @@ featuring documents of varying length and dispersed target information. """, # Pieced together from paper abstract. reference="https://arxiv.org/abs/2404.12096v2", - citation="""@article{zhu2024longembed, - title={LongEmbed: Extending Embedding Models for Long Context Retrieval}, - author={Zhu, Dawei and Wang, Liang and Yang, Nan and Song, Yifan and Wu, Wenhao and Wei, Furu and Li, Sujian}, - journal={arXiv preprint arXiv:2404.12096}, - year={2024} -}""", + citation=r""" +@article{zhu2024longembed, + author = {Zhu, Dawei and Wang, Liang and Yang, Nan and Song, Yifan and Wu, Wenhao and Wei, Furu and Li, Sujian}, + journal = {arXiv preprint arXiv:2404.12096}, + title = {LongEmbed: Extending Embedding Models for Long Context Retrieval}, + year = {2024}, +} +""", ) BRIGHT = Benchmark( @@ -1074,12 +1148,14 @@ naturally occurring and carefully curated human data. 
""", reference="https://brightbenchmark.github.io/", - citation="""@article{su2024bright, - title={Bright: A realistic and challenging benchmark for reasoning-intensive retrieval}, - author={Su, Hongjin and Yen, Howard and Xia, Mengzhou and Shi, Weijia and Muennighoff, Niklas and Wang, Han-yu and Liu, Haisu and Shi, Quan and Siegel, Zachary S and Tang, Michael and others}, - journal={arXiv preprint arXiv:2407.12883}, - year={2024} -}""", + citation=r""" +@article{su2024bright, + author = {Su, Hongjin and Yen, Howard and Xia, Mengzhou and Shi, Weijia and Muennighoff, Niklas and Wang, Han-yu and Liu, Haisu and Shi, Quan and Siegel, Zachary S and Tang, Michael and others}, + journal = {arXiv preprint arXiv:2407.12883}, + title = {Bright: A realistic and challenging benchmark for reasoning-intensive retrieval}, + year = {2024}, +} +""", ) BRIGHT_LONG = Benchmark( @@ -1101,12 +1177,14 @@ This is the long version of the benchmark, which only filter longer documents. """, reference="https://brightbenchmark.github.io/", - citation="""@article{su2024bright, - title={Bright: A realistic and challenging benchmark for reasoning-intensive retrieval}, - author={Su, Hongjin and Yen, Howard and Xia, Mengzhou and Shi, Weijia and Muennighoff, Niklas and Wang, Han-yu and Liu, Haisu and Shi, Quan and Siegel, Zachary S and Tang, Michael and others}, - journal={arXiv preprint arXiv:2407.12883}, - year={2024} -}""", + citation=r""" +@article{su2024bright, + author = {Su, Hongjin and Yen, Howard and Xia, Mengzhou and Shi, Weijia and Muennighoff, Niklas and Wang, Han-yu and Liu, Haisu and Shi, Quan and Siegel, Zachary S and Tang, Michael and others}, + journal = {arXiv preprint arXiv:2407.12883}, + title = {Bright: A realistic and challenging benchmark for reasoning-intensive retrieval}, + year = {2024}, +} +""", ) CODE_RAG = Benchmark( @@ -1121,16 +1199,17 @@ ), description="A benchmark for evaluating code retrieval augmented generation, testing models' ability to retrieve relevant 
programming solutions, tutorials and documentation.", reference="https://arxiv.org/abs/2406.14497", - citation="""@misc{wang2024coderagbenchretrievalaugmentcode, - title={CodeRAG-Bench: Can Retrieval Augment Code Generation?}, - author={Zora Zhiruo Wang and Akari Asai and Xinyan Velocity Yu and Frank F. Xu and Yiqing Xie and Graham Neubig and Daniel Fried}, - year={2024}, - eprint={2406.14497}, - archivePrefix={arXiv}, - primaryClass={cs.SE}, - url={https://arxiv.org/abs/2406.14497}, -}""", - display_on_leaderboard=False, + citation=r""" +@misc{wang2024coderagbenchretrievalaugmentcode, + archiveprefix = {arXiv}, + author = {Zora Zhiruo Wang and Akari Asai and Xinyan Velocity Yu and Frank F. Xu and Yiqing Xie and Graham Neubig and Daniel Fried}, + eprint = {2406.14497}, + primaryclass = {cs.SE}, + title = {CodeRAG-Bench: Can Retrieval Augment Code Generation?}, + url = {https://arxiv.org/abs/2406.14497}, + year = {2024}, +} +""", ) BEIR = Benchmark( @@ -1156,11 +1235,12 @@ + get_tasks(tasks=["MSMARCO"], languages=["eng"], eval_splits=["dev"]), description="BEIR is a heterogeneous benchmark containing diverse IR tasks. 
It also provides a common and easy framework for evaluation of your NLP-based retrieval models within the benchmark.", reference="https://arxiv.org/abs/2104.08663", - citation="""@article{thakur2021beir, - title={Beir: A heterogenous benchmark for zero-shot evaluation of information retrieval models}, - author={Thakur, Nandan and Reimers, Nils and R{\"u}ckl{\'e}, Andreas and Srivastava, Abhishek and Gurevych, Iryna}, - journal={arXiv preprint arXiv:2104.08663}, - year={2021} + citation=r""" +@article{thakur2021beir, + author = {Thakur, Nandan and Reimers, Nils and R{\"u}ckl{\'e}, Andreas and Srivastava, Abhishek and Gurevych, Iryna}, + journal = {arXiv preprint arXiv:2104.08663}, + title = {Beir: A heterogenous benchmark for zero-shot evaluation of information retrieval models}, + year = {2021}, } """, ) @@ -1191,6 +1271,8 @@ C_MTEB = Benchmark( name="MTEB(cmn, v1)", + display_name="Chinese", + icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/cn.svg", tasks=MTEBTasks( get_tasks( tasks=[ @@ -1241,18 +1323,22 @@ ), description="The Chinese Massive Text Embedding Benchmark (C-MTEB) is a comprehensive benchmark for Chinese text embeddings covering 6 tasks and 35 datasets.", reference="https://github.com/FlagOpen/FlagEmbedding/tree/master/research/C_MTEB", - citation="""@misc{c-pack, - title={C-Pack: Packaged Resources To Advance General Chinese Embedding}, - author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff}, - year={2023}, - eprint={2309.07597}, - archivePrefix={arXiv}, - primaryClass={cs.CL} -}""", + citation=r""" +@misc{c-pack, + archiveprefix = {arXiv}, + author = {Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff}, + eprint = {2309.07597}, + primaryclass = {cs.CL}, + title = {C-Pack: Packaged Resources To Advance General Chinese Embedding}, + year = {2023}, +} +""", ) FA_MTEB = Benchmark( name="MTEB(fas, beta)", + display_name="Farsi (BETA)", + 
icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/ir.svg", tasks=get_tasks( languages=["fas"], tasks=[ @@ -1333,6 +1419,8 @@ CHEMTEB = Benchmark( name="ChemTEB", + display_name="Chemical", + icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-gui-purge.svg", tasks=get_tasks( tasks=[ "PubChemSMILESBitextMining", @@ -1366,12 +1454,14 @@ ), description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", reference="https://arxiv.org/abs/2412.00532", - citation="""@article{kasmaee2024chemteb, - title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \\& Efficiency on a Specific Domain}, - author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, - journal={arXiv preprint arXiv:2412.00532}, - year={2024} -}""", + citation=r""" +@article{kasmaee2024chemteb, + author = {Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal = {arXiv preprint arXiv:2412.00532}, + title = {ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \\& Efficiency on a Specific Domain}, + year = {2024}, +} +""", ) BEIR_NL = Benchmark( @@ -1399,15 +1489,437 @@ "translation.", reference="https://arxiv.org/abs/2412.08329", contacts=["nikolay-banar"], - citation="""@misc{banar2024beirnlzeroshotinformationretrieval, - title={BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, - author={Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, - year={2024}, - eprint={2412.08329}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2412.08329}, -}""", + citation=r""" +@misc{banar2024beirnlzeroshotinformationretrieval, + archiveprefix = {arXiv}, + 
author = {Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, + eprint = {2412.08329}, + primaryclass = {cs.CL}, + title = {BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, + url = {https://arxiv.org/abs/2412.08329}, + year = {2024}, +} +""", +) + +MIEB_common_tasks = [ + # Image Classification + "Birdsnap", # fine + "Caltech101", # fine + "CIFAR10", # coarse + "CIFAR100", # fine + "Country211", # fine + "DTD", # coarse + "EuroSAT", # coarse + "FER2013", # coarse + "FGVCAircraft", # fine + "Food101Classification", # fine + "GTSRB", # coarse + "Imagenet1k", # fine + "MNIST", # coarse + "OxfordFlowersClassification", # fine + "OxfordPets", # fine + "PatchCamelyon", # coarse + "RESISC45", # fine + "StanfordCars", # fine + "STL10", # coarse + "SUN397", # fine + "UCF101", # fine + # ImageMultiLabelClassification + "VOC2007", # coarse + # Clustering + "CIFAR10Clustering", + "CIFAR100Clustering", + "ImageNetDog15Clustering", + "ImageNet10Clustering", + "TinyImageNetClustering", + # ZeroShotClassification + "BirdsnapZeroShot", + "Caltech101ZeroShot", + "CIFAR10ZeroShot", + "CIFAR100ZeroShot", + "CLEVRZeroShot", + "CLEVRCountZeroShot", + "Country211ZeroShot", + "DTDZeroShot", + "EuroSATZeroShot", + "FER2013ZeroShot", + "FGVCAircraftZeroShot", + "Food101ZeroShot", + "GTSRBZeroShot", + "Imagenet1kZeroShot", + "MNISTZeroShot", + "OxfordPetsZeroShot", + "PatchCamelyonZeroShot", + "RenderedSST2", + "RESISC45ZeroShot", + "StanfordCarsZeroShot", + "STL10ZeroShot", + "SUN397ZeroShot", + "UCF101ZeroShot", + # Any2AnyMultipleChoice + "BLINKIT2IMultiChoice", + "BLINKIT2TMultiChoice", + "CVBenchCount", + "CVBenchRelation", + "CVBenchDepth", + "CVBenchDistance", + # Compositionality + "AROCocoOrder", + "AROFlickrOrder", + "AROVisualAttribution", + "AROVisualRelation", + "SugarCrepe", + "Winoground", + "ImageCoDe", + # VisualSTS + "STS12VisualSTS", + "STS13VisualSTS", + "STS14VisualSTS", + "STS15VisualSTS", + "STS16VisualSTS", + # Any2AnyRetrieval + 
"BLINKIT2IRetrieval", + "BLINKIT2TRetrieval", + "CIRRIT2IRetrieval", + "CUB200I2IRetrieval", + "EDIST2ITRetrieval", + "Fashion200kI2TRetrieval", + "Fashion200kT2IRetrieval", + "FashionIQIT2IRetrieval", + "Flickr30kI2TRetrieval", + "Flickr30kT2IRetrieval", + "FORBI2IRetrieval", + "GLDv2I2IRetrieval", + "GLDv2I2TRetrieval", + "HatefulMemesI2TRetrieval", + "HatefulMemesT2IRetrieval", + "ImageCoDeT2IRetrieval", + "InfoSeekIT2ITRetrieval", + "InfoSeekIT2TRetrieval", + "MemotionI2TRetrieval", + "MemotionT2IRetrieval", + "METI2IRetrieval", + "MSCOCOI2TRetrieval", + "MSCOCOT2IRetrieval", + "NIGHTSI2IRetrieval", + "OVENIT2ITRetrieval", + "OVENIT2TRetrieval", + "ROxfordEasyI2IRetrieval", + "ROxfordMediumI2IRetrieval", + "ROxfordHardI2IRetrieval", + "RP2kI2IRetrieval", + "RParisEasyI2IRetrieval", + "RParisMediumI2IRetrieval", + "RParisHardI2IRetrieval", + "SciMMIRI2TRetrieval", + "SciMMIRT2IRetrieval", + "SketchyI2IRetrieval", + "SOPI2IRetrieval", + "StanfordCarsI2IRetrieval", + "TUBerlinT2IRetrieval", + "VidoreArxivQARetrieval", + "VidoreDocVQARetrieval", + "VidoreInfoVQARetrieval", + "VidoreTabfquadRetrieval", + "VidoreTatdqaRetrieval", + "VidoreShiftProjectRetrieval", + "VidoreSyntheticDocQAAIRetrieval", + "VidoreSyntheticDocQAEnergyRetrieval", + "VidoreSyntheticDocQAGovernmentReportsRetrieval", + "VidoreSyntheticDocQAHealthcareIndustryRetrieval", + "VisualNewsI2TRetrieval", + "VisualNewsT2IRetrieval", + "VizWizIT2TRetrieval", + "VQA2IT2TRetrieval", + "WebQAT2ITRetrieval", + "WebQAT2TRetrieval", +] + +MIEB_ENG = Benchmark( + name="MIEB(eng)", + display_name="Image-Text, English", + icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-gui-picture.svg", + tasks=get_tasks( + tasks=MIEB_common_tasks + + [ + "VisualSTS17Eng", + "VisualSTS-b-Eng", + ], + ), + description="""MIEB(eng) is a comprehensive image embeddings benchmark, spanning 8 task types, covering 125 tasks. 
+ In addition to image classification (zero shot and linear probing), clustering, retrieval, MIEB includes tasks in compositionality evaluation, + document undestanding, visual STS, and CV-centric tasks.""", + reference="https://arxiv.org/abs/2504.10471", + contacts=["gowitheflow-1998", "isaac-chung"], + citation=r""" +@article{xiao2025mieb, + author = {Chenghao Xiao and Isaac Chung and Imene Kerboua and Jamie Stirling and Xin Zhang and Márton Kardos and Roman Solomatin and Noura Al Moubayed and Kenneth Enevoldsen and Niklas Muennighoff}, + doi = {10.48550/ARXIV.2504.10471}, + journal = {arXiv preprint arXiv:2504.10471}, + publisher = {arXiv}, + title = {MIEB: Massive Image Embedding Benchmark}, + url = {https://arxiv.org/abs/2504.10471}, + year = {2025}, +} +""", +) + +MIEB_MULTILINGUAL = Benchmark( + name="MIEB(Multilingual)", + display_name="Image-Text, Multilingual", + icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-gui-pictures.svg", + tasks=get_tasks( + tasks=MIEB_common_tasks + + [ + "WITT2IRetrieval", + "XFlickr30kCoT2IRetrieval", + "XM3600T2IRetrieval", + "VisualSTS17Eng", + "VisualSTS-b-Eng", + "VisualSTS17Multilingual", + "VisualSTS-b-Multilingual", + ], + ), + description="""MIEB(Multilingual) is a comprehensive image embeddings benchmark, spanning 10 task types, covering 130 tasks and a total of 39 languages. + In addition to image classification (zero shot and linear probing), clustering, retrieval, MIEB includes tasks in compositionality evaluation, + document undestanding, visual STS, and CV-centric tasks. 
This benchmark consists of MIEB(eng) + 3 multilingual retrieval + datasets + the multilingual parts of VisualSTS-b and VisualSTS-16.""", + reference="https://arxiv.org/abs/2504.10471", + contacts=["gowitheflow-1998", "isaac-chung"], + citation=r""" +@article{xiao2025mieb, + author = {Chenghao Xiao and Isaac Chung and Imene Kerboua and Jamie Stirling and Xin Zhang and Márton Kardos and Roman Solomatin and Noura Al Moubayed and Kenneth Enevoldsen and Niklas Muennighoff}, + doi = {10.48550/ARXIV.2504.10471}, + journal = {arXiv preprint arXiv:2504.10471}, + publisher = {arXiv}, + title = {MIEB: Massive Image Embedding Benchmark}, + url = {https://arxiv.org/abs/2504.10471}, + year = {2025}, +} +""", +) + +MIEB_LITE = Benchmark( + name="MIEB(lite)", + display_name="Image-Text, Lite", + icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-map-landscape.svg", + tasks=get_tasks( + tasks=[ + # Image Classification + "Country211", + "DTD", + "EuroSAT", + "GTSRB", + "OxfordPets", + "PatchCamelyon", + "RESISC45", + "SUN397", + # Clustering + "ImageNetDog15Clustering", + "TinyImageNetClustering", + # ZeroShotClassification + "CIFAR100ZeroShot", + "Country211ZeroShot", + "FER2013ZeroShot", + "FGVCAircraftZeroShot", + "Food101ZeroShot", + "OxfordPetsZeroShot", + "StanfordCarsZeroShot", + # Any2AnyMultipleChoice + "BLINKIT2IMultiChoice", + "CVBenchCount", + "CVBenchRelation", + "CVBenchDepth", + "CVBenchDistance", + # ImageTextPairClassification + "AROCocoOrder", + "AROFlickrOrder", + "AROVisualAttribution", + "AROVisualRelation", + "Winoground", + "ImageCoDe", + # VisualSTS + "STS13VisualSTS", + "STS15VisualSTS", + "VisualSTS17Multilingual", + "VisualSTS-b-Multilingual", + # Any2AnyRetrieval + "CIRRIT2IRetrieval", + "CUB200I2IRetrieval", + "Fashion200kI2TRetrieval", + "HatefulMemesI2TRetrieval", + "InfoSeekIT2TRetrieval", + "NIGHTSI2IRetrieval", + "OVENIT2TRetrieval", + "RP2kI2IRetrieval", + "VidoreDocVQARetrieval", + 
"VidoreInfoVQARetrieval", + "VidoreTabfquadRetrieval", + "VidoreTatdqaRetrieval", + "VidoreShiftProjectRetrieval", + "VidoreSyntheticDocQAAIRetrieval", + "VisualNewsI2TRetrieval", + "VQA2IT2TRetrieval", + "WebQAT2ITRetrieval", + "WITT2IRetrieval", + "XM3600T2IRetrieval", + ], + ), + description="""MIEB(lite) is a comprehensive image embeddings benchmark, spanning 10 task types, covering 51 tasks. + This is a lite version of MIEB(Multilingual), designed to be run at a fraction of the cost while maintaining + relative rank of models.""", + reference="https://arxiv.org/abs/2504.10471", + contacts=["gowitheflow-1998", "isaac-chung"], + citation=r""" +@article{xiao2025mieb, + author = {Chenghao Xiao and Isaac Chung and Imene Kerboua and Jamie Stirling and Xin Zhang and Márton Kardos and Roman Solomatin and Noura Al Moubayed and Kenneth Enevoldsen and Niklas Muennighoff}, + doi = {10.48550/ARXIV.2504.10471}, + journal = {arXiv preprint arXiv:2504.10471}, + publisher = {arXiv}, + title = {MIEB: Massive Image Embedding Benchmark}, + url = {https://arxiv.org/abs/2504.10471}, + year = {2025}, +} +""", +) + +MIEB_IMG = Benchmark( + name="MIEB(Img)", + display_name="Image only", + icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-gui-pictures.svg", + tasks=get_tasks( + tasks=[ + "CUB200I2IRetrieval", + "FORBI2IRetrieval", + "GLDv2I2IRetrieval", + "METI2IRetrieval", + "NIGHTSI2IRetrieval", + "ROxfordEasyI2IRetrieval", + "ROxfordMediumI2IRetrieval", + "ROxfordHardI2IRetrieval", + "RP2kI2IRetrieval", + "RParisEasyI2IRetrieval", + "RParisMediumI2IRetrieval", + "RParisHardI2IRetrieval", + "SketchyI2IRetrieval", + "SOPI2IRetrieval", + "StanfordCarsI2IRetrieval", + "Birdsnap", + "Caltech101", + "CIFAR10", + "CIFAR100", + "Country211", + "DTD", + "EuroSAT", + "FER2013", + "FGVCAircraft", + "Food101Classification", + "GTSRB", + "Imagenet1k", + "MNIST", + "OxfordFlowersClassification", + "OxfordPets", + "PatchCamelyon", + 
"RESISC45", + "StanfordCars", + "STL10", + "SUN397", + "UCF101", + "CIFAR10Clustering", + "CIFAR100Clustering", + "ImageNetDog15Clustering", + "ImageNet10Clustering", + "TinyImageNetClustering", + "VOC2007", + "STS12VisualSTS", + "STS13VisualSTS", + "STS14VisualSTS", + "STS15VisualSTS", + "STS16VisualSTS", + "STS17MultilingualVisualSTS", + "STSBenchmarkMultilingualVisualSTS", + ], + ), + description="A image-only version of MIEB(Multilingual) that consists of 49 tasks.", + reference="https://arxiv.org/abs/2504.10471", + citation=r""" +@article{xiao2025mieb, + author = {Chenghao Xiao and Isaac Chung and Imene Kerboua and Jamie Stirling and Xin Zhang and Márton Kardos and Roman Solomatin and Noura Al Moubayed and Kenneth Enevoldsen and Niklas Muennighoff}, + doi = {10.48550/ARXIV.2504.10471}, + journal = {arXiv preprint arXiv:2504.10471}, + publisher = {arXiv}, + title = {MIEB: Massive Image Embedding Benchmark}, + url = {https://arxiv.org/abs/2504.10471}, + year = {2025}, +} +""", + contacts=["gowitheflow-1998", "isaac-chung"], +) + +BUILT_MTEB = Benchmark( + name="BuiltBench(eng)", + tasks=get_tasks( + tasks=[ + "BuiltBenchClusteringP2P", + "BuiltBenchClusteringS2S", + "BuiltBenchRetrieval", + "BuiltBenchReranking", + ], + ), + description='"Built-Bench" is an ongoing effort aimed at evaluating text embedding models in the context of built asset management, spanning over various dicsiplines such as architeture, engineering, constrcution, and operations management of the built environment.', + reference="https://arxiv.org/abs/2411.12056", + citation=r""" +@article{shahinmoghadam2024benchmarking, + author = {Shahinmoghadam, Mehrzad and Motamedi, Ali}, + journal = {arXiv preprint arXiv:2411.12056}, + title = {Benchmarking pre-trained text embedding models in aligning built asset information}, + year = {2024}, +} +""", + contacts=["mehrzadshm"], +) + +ENCODECHKA = Benchmark( + name="Encodechka", + tasks=MTEBTasks( + get_tasks( + tasks=[ + # PI + "RUParaPhraserSTS", + # 
SA + "SentiRuEval2016", + # TI + "RuToxicOKMLCUPClassification", + # IA + "InappropriatenessClassificationv2", + # IC, ICX + "RuNLUIntentClassification", + ] + ) + + + # NLI + get_tasks(tasks=["XNLI"], eval_splits=["test"], languages=["rus-Cyrl"]) + # STS + + get_tasks( + tasks=["RuSTSBenchmarkSTS"], + eval_splits=["validation"], + languages=["rus-Cyrl"], + ), + ), + description="A benchmark for evaluating text embedding models on Russian data.", + reference="https://github.com/avidale/encodechka", + citation=r""" +@misc{dale_encodechka, + author = {Dale, David}, + editor = {habr.com}, + month = {June}, + note = {[Online; posted 12-June-2022]}, + title = {Russian rating of sentence encoders}, + url = {https://habr.com/ru/articles/669674/}, + year = {2022}, +} +""", ) MIEB_common_tasks = [ @@ -1668,11 +2180,13 @@ ), description='"Built-Bench" is an ongoing effort aimed at evaluating text embedding models in the context of built asset management, spanning over various dicsiplines such as architeture, engineering, constrcution, and operations management of the built environment.', reference="https://arxiv.org/abs/2411.12056", - citation="""@article{shahinmoghadam2024benchmarking, - title={Benchmarking pre-trained text embedding models in aligning built asset information}, - author={Shahinmoghadam, Mehrzad and Motamedi, Ali}, - journal={arXiv preprint arXiv:2411.12056}, - year={2024} -}""", + citation=r""" +@article{shahinmoghadam2024benchmarking, + author = {Shahinmoghadam, Mehrzad and Motamedi, Ali}, + journal = {arXiv preprint arXiv:2411.12056}, + title = {Benchmarking pre-trained text embedding models in aligning built asset information}, + year = {2024}, +} +""", contacts=["mehrzadshm"], ) diff --git a/mteb/custom_validators.py b/mteb/custom_validators.py index ab97f8ce22..6f11d8a27c 100644 --- a/mteb/custom_validators.py +++ b/mteb/custom_validators.py @@ -42,6 +42,7 @@ "mpl-2.0", "msr-la-nc", "multiple", + "openrail", ] ) diff --git a/mteb/encoder_interface.py 
b/mteb/encoder_interface.py index 39b606b953..b2ea85eee9 100644 --- a/mteb/encoder_interface.py +++ b/mteb/encoder_interface.py @@ -182,29 +182,24 @@ def encode( # current a 1-1 match with Encoder.encode ) -> np.ndarray: pass - def get_image_embeddings( # Seems like sentence transformers use a singular encode for both images and text. Not sure if we want to do the same. - # If not it might be ideal to redefine Encoder.encode + def get_image_embeddings( self, images: list[Image.Image] | DataLoader, **kwargs, - # removed batch_size, it is not required that it will accept kwargs - ) -> np.ndarray: # added standard output (I believe we actually expect tensors in the code, but would like to be consistent) + ) -> np.ndarray: pass - def get_text_embeddings( # any reason for this? + def get_text_embeddings( self, texts: list[str], **kwargs, ) -> np.ndarray: pass - def get_fused_embeddings( # hmm what if I have a document with images at specific positions? + def get_fused_embeddings( self, texts: list[str] | None = None, - images: list[Image.Image] - | DataLoader - | None = None, # the requirement for these two to be the same seems odd (docs without images, images without associated text, docs with multiple images) - # fusion_mode: str="sum", # will remove this as it should be required in the interface + images: list[Image.Image] | DataLoader | None = None, **kwargs: Any, ) -> np.ndarray: pass diff --git a/mteb/evaluation/MTEB.py b/mteb/evaluation/MTEB.py index a6bacc189e..f1ae39ef9c 100644 --- a/mteb/evaluation/MTEB.py +++ b/mteb/evaluation/MTEB.py @@ -17,7 +17,6 @@ from sentence_transformers import CrossEncoder, SentenceTransformer from mteb.abstasks.AbsTask import ScoresDict -from mteb.abstasks.aggregated_task import AbsTaskAggregate from mteb.encoder_interface import Encoder from mteb.model_meta import ModelMeta from mteb.models import model_meta_from_sentence_transformers @@ -467,7 +466,7 @@ def run( f"\n\n********************** Evaluating {task.metadata.name} 
**********************" ) - if isinstance(task, AbsTaskAggregate): + if task.is_aggregate: self_ = MTEB(tasks=task.metadata.tasks) task_results = self_.run( model, diff --git a/mteb/languages.py b/mteb/languages.py index e83dd308cd..4761229f91 100644 --- a/mteb/languages.py +++ b/mteb/languages.py @@ -20,6 +20,24 @@ path_to_lang_scripts = Path(__file__).parent / "iso_15924_to_script.json" path_to_lang_fam = Path(__file__).parent / "language_family.json" +PROGRAMMING_LANGS = [ + "python", + "javascript", + "typescript", + "go", + "ruby", + "java", + "php", + "c", + "c++", + "c#", + "rust", + "swift", + "scala", + "shell", + "sql", +] + with path_to_lang_codes.open("r") as f: ISO_TO_LANGUAGE = json.load(f) @@ -98,3 +116,23 @@ def contains_scripts(self, scripts: Iterable[str]) -> bool: if not self.contains_script(s): return False return True + + +def check_language_code(code: str) -> None: + """This method checks that the language code (e.g. "eng-Latn") is valid.""" + lang, script = code.split("-") + if script == "Code": + if lang in PROGRAMMING_LANGS: + return # override for code + else: + raise ValueError( + f"Programming language {lang} is not a valid programming language." 
+ ) + if lang not in ISO_TO_LANGUAGE: + raise ValueError( + f"Invalid language code: {lang}, you can find valid ISO 639-3 codes in {path_to_lang_codes}" + ) + if script not in ISO_TO_SCRIPT: + raise ValueError( + f"Invalid script code: {script}, you can find valid ISO 15924 codes in {path_to_lang_scripts}" + ) diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index e3833b5ce3..97bf42266e 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -20,6 +20,7 @@ from mteb.benchmarks.benchmarks import MTEB_multilingual from mteb.custom_validators import MODALITIES from mteb.languages import ISO_TO_LANGUAGE +from mteb.leaderboard.benchmark_selector import BENCHMARK_ENTRIES, make_selector from mteb.leaderboard.figures import performance_size_plot, radar_chart from mteb.leaderboard.table import create_tables @@ -104,7 +105,7 @@ def update_description( benchmark_name: str, languages: list[str], task_types: list[str], domains: list[str] ) -> str: benchmark = mteb.get_benchmark(benchmark_name) - description = f"## {benchmark.name}\n{benchmark.description}\n" + description = f"{benchmark.description}\n" n_languages = len(languages) n_task_types = len(task_types) n_tasks = len(benchmark.tasks) @@ -156,7 +157,13 @@ def update_task_info(task_names: str) -> gr.DataFrame: } ) df = df.drop(columns="reference") - return gr.DataFrame(df, datatype=["markdown"] + ["str"] * (len(df.columns) - 1)) + return gr.DataFrame( + df, + datatype=["markdown"] + ["str"] * (len(df.columns) - 1), + show_copy_button=True, + show_fullscreen_button=True, + show_search="filter", + ) # Model sizes in million parameters @@ -235,13 +242,6 @@ def get_leaderboard_app() -> gr.Blocks: summary_table, per_task_table = create_tables( [entry for entry in default_scores if entry["model_name"] in filtered_models] ) - - benchmark_select = gr.Dropdown( - [bench.name for bench in benchmarks], - value=default_benchmark.name, - label="Prebuilt Benchmarks", - info="Select one of our expert-selected 
benchmarks from MTEB publications.", - ) lang_select = gr.Dropdown( ISO_TO_LANGUAGE, value=sorted(default_results.languages), @@ -284,116 +284,46 @@ def get_leaderboard_app() -> gr.Blocks: """ - with gr.Blocks(fill_width=True, theme=gr.themes.Base(), head=head) as demo: + with gr.Blocks( + fill_width=True, + theme=gr.themes.Soft( + font=[gr.themes.GoogleFont("Roboto Mono"), "Arial", "sans-serif"], + ), + head=head, + ) as demo: + with gr.Sidebar( + position="left", + label="Benchmark Selection and Customization", + visible=True, + width="25%", + ): + gr.Markdown("## Select Benchmark") + benchmark_select, column = make_selector(BENCHMARK_ENTRIES) gr.Markdown( """ ## Embedding Leaderboard - This leaderboard compares 100+ text and image (soon) embedding models across 1000+ languages. We refer to the publication of each selectable benchmark for details on metrics, languages, tasks, and task types. Anyone is welcome [to add a model](https://github.com/embeddings-benchmark/mteb/blob/main/docs/adding_a_model.md), [add benchmarks](https://github.com/embeddings-benchmark/mteb/blob/main/docs/adding_a_benchmark.md), [help us improve zero-shot annotations](https://github.com/embeddings-benchmark/mteb/blob/06489abca007261c7e6b11f36d4844c5ed5efdcb/mteb/models/bge_models.py#L91) or [propose other changes to the leaderboard](https://github.com/embeddings-benchmark/mteb/tree/main/mteb/leaderboard) 🤗 Also, check out [MTEB Arena](https://huggingface.co/spaces/mteb/arena) ⚔️ - - > Looking for the previous MTEB leaderboard? We have made it available [here](https://huggingface.co/spaces/mteb/leaderboard_legacy) but it will no longer be updated. + This leaderboard compares 100+ text and image embedding models across 1000+ languages. We refer to the publication of each selectable benchmark for details on metrics, languages, tasks, and task types. 
Anyone is welcome [to add a model](https://github.com/embeddings-benchmark/mteb/blob/main/docs/adding_a_model.md), [add benchmarks](https://github.com/embeddings-benchmark/mteb/blob/main/docs/adding_a_benchmark.md), [help us improve zero-shot annotations](https://github.com/embeddings-benchmark/mteb/blob/06489abca007261c7e6b11f36d4844c5ed5efdcb/mteb/models/bge_models.py#L91) or [propose other changes to the leaderboard](https://github.com/embeddings-benchmark/mteb/tree/main/mteb/leaderboard) 🤗 Also, check out [MTEB Arena](https://huggingface.co/spaces/mteb/arena) ⚔️ """ ) + gr.Markdown( + lambda name: f"

{name}


", + inputs=benchmark_select, + ) - with gr.Row(): - with gr.Column(scale=5): - gr.Markdown( - "### Benchmarks\n" - "Select one of the hand-curated benchmarks from our publications and modify them using one of the following filters to fit your needs." - ) - with gr.Group(): - with gr.Row(elem_classes="overflow-y-scroll max-h-80"): - with gr.Column(): - benchmark_select.render() - with gr.Accordion("Select Languages", open=False): - lang_select.render() - with gr.Accordion("Select Task Types", open=False): - type_select.render() - with gr.Accordion("Select Domains", open=False): - domain_select.render() - with gr.Accordion("Select Modalities", open=False): - modality_select.render() - with gr.Accordion("Add and remove tasks:", open=False): - task_select.render() - with gr.Column(scale=8): - gr.Markdown( - """ - ### Model Selection - Select models to rank based on an assortment of criteria. - """, - ) - with gr.Group(): - with gr.Row(): - searchbar = gr.Textbox( - label="Search Models", - info="Press Enter to search.\nSearch models by name (RegEx sensitive. 
Separate queries with `|`)", - interactive=True, - ) - compatibility = gr.CheckboxGroup( - [ - ( - "Should be sentence-transformers compatible", - "Sentence Transformers", - ) - ], - value=[], - label="Compatibility", - interactive=True, - ) - with gr.Row(elem_classes=""): - with gr.Column(): - availability = gr.Radio( - [ - ("Only Open", True), - ("Only Proprietary", False), - ("Both", None), - ], - value=None, - label="Availability", - interactive=True, - ) - instructions = gr.Radio( - [ - ("Only Instruction-tuned", True), - ("Only non-instruction", False), - ("Both", None), - ], - value=None, - label="Instructions", - interactive=True, - ) - with gr.Column(): - zero_shot = gr.Radio( - [ - ( - "Only Zero-shot", - "only_zero_shot", - ), - ("Remove Unknown", "remove_unknown"), - ("Allow All", "allow_all"), - ], - value="allow_all", - label="Zero-shot", - interactive=True, - ) - model_size = RangeSlider( - minimum=MIN_MODEL_SIZE, - maximum=MAX_MODEL_SIZE, - value=(MIN_MODEL_SIZE, MAX_MODEL_SIZE), - label="Model Size (#M Parameters)", - ) scores = gr.State(default_scores) models = gr.State(filtered_models) with gr.Row(): - with gr.Column(): + with gr.Column(scale=1): description = gr.Markdown( # noqa: F841 update_description, inputs=[benchmark_select, lang_select, type_select, domain_select], ) - citation = gr.Markdown(update_citation, inputs=[benchmark_select]) # noqa: F841 + with gr.Accordion("Cite this benchmark:", open=False): + citation = gr.Markdown(update_citation, inputs=[benchmark_select]) # noqa: F841 with gr.Accordion("Share this benchmark:", open=False): gr.Markdown(produce_benchmark_link, inputs=[benchmark_select]) - with gr.Column(): + with gr.Column(scale=2): with gr.Tab("Performance per Model Size"): plot = gr.Plot(performance_size_plot, inputs=[summary_table]) # noqa: F841 gr.Markdown( @@ -404,6 +334,76 @@ def get_leaderboard_app() -> gr.Blocks: gr.Markdown( "*We only display models that have been run on all task types in the benchmark*" ) + + with 
gr.Accordion("Customize this Benchmark", open=False): + with gr.Column(): + with gr.Row(): + type_select.render() + with gr.Row(): + domain_select.render() + with gr.Row(): + modality_select.render() + with gr.Row(elem_classes="overflow-y-scroll max-h-80"): + lang_select.render() + with gr.Row(elem_classes="overflow-y-scroll max-h-80"): + task_select.render() + + with gr.Accordion("Advanced Model Filters", open=False): + with gr.Group(): + with gr.Row(elem_classes=""): + with gr.Column(): + compatibility = gr.CheckboxGroup( + [ + ( + "Should be sentence-transformers compatible", + "Sentence Transformers", + ) + ], + value=[], + label="Compatibility", + interactive=True, + ) + availability = gr.Radio( + [ + ("Only Open", True), + ("Only Proprietary", False), + ("Both", None), + ], + value=None, + label="Availability", + interactive=True, + ) + instructions = gr.Radio( + [ + ("Only Instruction-tuned", True), + ("Only non-instruction", False), + ("Both", None), + ], + value=None, + label="Instructions", + interactive=True, + ) + with gr.Column(): + zero_shot = gr.Radio( + [ + ( + "Only Zero-shot", + "only_zero_shot", + ), + ("Remove Unknown", "remove_unknown"), + ("Allow All", "allow_all"), + ], + value="allow_all", + label="Zero-shot", + interactive=True, + ) + model_size = RangeSlider( + minimum=MIN_MODEL_SIZE, + maximum=MAX_MODEL_SIZE, + value=(MIN_MODEL_SIZE, MAX_MODEL_SIZE), + label="Model Size (#M Parameters)", + ) + with gr.Tab("Summary"): summary_table.render() download_summary = gr.DownloadButton("Download Table") @@ -512,7 +512,7 @@ def on_benchmark_select(benchmark_name): elapsed = time.time() - start_time benchmark_results = all_benchmark_results[benchmark_name] scores = benchmark_results.get_scores(format="long") - logger.info(f"on_benchmark_select callback: {elapsed}s") + logger.debug(f"on_benchmark_select callback: {elapsed}s") return ( languages, domains, @@ -543,10 +543,12 @@ def on_benchmark_select(benchmark_name): ) def 
update_scores_on_lang_change(benchmark_name, languages): start_time = time.time() + if not len(languages): + return [] benchmark_results = all_benchmark_results[benchmark_name] scores = benchmark_results.get_scores(languages=languages, format="long") elapsed = time.time() - start_time - logger.info(f"update_scores callback: {elapsed}s") + logger.debug(f"update_scores callback: {elapsed}s") return scores lang_select.input( @@ -574,20 +576,28 @@ def update_scores_on_lang_change(benchmark_name, languages): def update_task_list( benchmark_name, type_select, domain_select, lang_select, modality_select ): + if not len(lang_select): + return [] start_time = time.time() tasks_to_keep = [] for task in mteb.get_benchmark(benchmark_name).tasks: if task.metadata.type not in type_select: continue - if not (set(task.metadata.domains or []) & set(domain_select)): + if task.metadata.domains is not None and not ( + set(task.metadata.domains) & set(domain_select) + ): continue - if not (set(task.languages or []) & set(lang_select)): + if task.languages is not None and not ( + set(task.languages) & set(lang_select) + ): continue - if not (set(task.metadata.modalities or []) & set(modality_select)): + if task.metadata.modalities and not ( + set(task.metadata.modalities) & set(modality_select) + ): continue tasks_to_keep.append(task.metadata.name) elapsed = time.time() - start_time - logger.info(f"update_task_list callback: {elapsed}s") + logger.debug(f"update_task_list callback: {elapsed}s") return sorted(tasks_to_keep) type_select.input( @@ -679,7 +689,7 @@ def update_models( if model_names == filtered_models: # This indicates that the models should not be filtered return None - logger.info(f"update_models callback: {elapsed}s") + logger.debug(f"update_models callback: {elapsed}s") return sorted(filtered_models) scores.change( @@ -776,14 +786,9 @@ def update_models( @cachetools.cached( cache={}, - key=lambda scores, - search_query, - tasks, - models_to_keep, - benchmark_name: hash( + 
key=lambda scores, tasks, models_to_keep, benchmark_name: hash( ( id(scores), - hash(search_query), hash(tuple(tasks)), id(models_to_keep), hash(benchmark_name), @@ -792,7 +797,6 @@ def update_models( ) def update_tables( scores, - search_query: str, tasks, models_to_keep, benchmark_name: str, @@ -813,33 +817,33 @@ def update_tables( filtered_scores.append(entry) else: filtered_scores = scores - summary, per_task = create_tables(filtered_scores, search_query) + summary, per_task = create_tables(filtered_scores) elapsed = time.time() - start_time - logger.info(f"update_tables callback: {elapsed}s") + logger.debug(f"update_tables callback: {elapsed}s") return summary, per_task task_select.change( update_tables, - inputs=[scores, searchbar, task_select, models, benchmark_select], + inputs=[scores, task_select, models, benchmark_select], outputs=[summary_table, per_task_table], ) scores.change( update_tables, - inputs=[scores, searchbar, task_select, models, benchmark_select], + inputs=[scores, task_select, models, benchmark_select], outputs=[summary_table, per_task_table], ) models.change( update_tables, - inputs=[scores, searchbar, task_select, models, benchmark_select], - outputs=[summary_table, per_task_table], - ) - searchbar.submit( - update_tables, - inputs=[scores, searchbar, task_select, models, benchmark_select], + inputs=[scores, task_select, models, benchmark_select], outputs=[summary_table, per_task_table], ) gr.Markdown(acknowledgment_md, elem_id="ack_markdown") + gr.Markdown( + """ + > Looking for the previous MTEB leaderboard? We have made it available [here](https://huggingface.co/spaces/mteb/leaderboard_legacy) but it will no longer be updated. 
+ """ + ) # Prerun on all benchmarks, so that results of callbacks get cached for benchmark in benchmarks: @@ -862,7 +866,7 @@ def update_tables( ) # We have to call this both on the filtered and unfiltered task because the callbacks # also gets called twice for some reason - update_tables(bench_scores, "", bench_tasks, filtered_models, benchmark.name) + update_tables(bench_scores, bench_tasks, filtered_models, benchmark.name) filtered_tasks = update_task_list( benchmark.name, bench_types, @@ -870,7 +874,7 @@ def update_tables( bench_languages, bench_modalities, ) - update_tables(bench_scores, "", filtered_tasks, filtered_models, benchmark.name) + update_tables(bench_scores, filtered_tasks, filtered_models, benchmark.name) return demo diff --git a/mteb/leaderboard/benchmark_selector.py b/mteb/leaderboard/benchmark_selector.py new file mode 100644 index 0000000000..837c6b7f8e --- /dev/null +++ b/mteb/leaderboard/benchmark_selector.py @@ -0,0 +1,176 @@ +from __future__ import annotations + +import gradio as gr + +import mteb +from mteb import Benchmark + +""" +Each entry is a tuple, where the first element is a label, and the second is either a single benchmark or a group of benchmarks. 
+ +Example: +[ + ("First Benchmark", dict(value="MTEB(something)", icon="icon_url")), + ("Group of Benchmarks", + [ + ("Second Benchmark", dict(value="MTEB(something)", icon="icon_url")), + ("Third Benchmark", dict(value="MTEB(something)", icon="icon_url")), + ], + ), +] +""" +BENCHMARK_ENTRIES = [ + mteb.get_benchmarks(["MTEB(Multilingual, v2)", "MTEB(eng, v2)"]), + ( + "Image Benchmarks", + mteb.get_benchmarks( + [ + "MIEB(Multilingual)", + "MIEB(eng)", + "MIEB(lite)", + "MIEB(Img)", + ] + ), + ), + ( + "Domain-Specific Benchmarks", + mteb.get_benchmarks( + [ + "MTEB(Code, v1)", + "MTEB(Law, v1)", + "MTEB(Medical, v1)", + "ChemTEB", + ] + ), + ), + ( + "Regional Benchmarks", + mteb.get_benchmarks( + [ + "MTEB(Europe, v1)", + "MTEB(Indic, v1)", + "MTEB(Scandinavian, v1)", + ] + ), + ), + ( + "Language-specific Benchmarks", + mteb.get_benchmarks( + [ + "MTEB(cmn, v1)", + "MTEB(deu, v1)", + "MTEB(fra, v1)", + "MTEB(jpn, v1)", + "MTEB(kor, v1)", + "MTEB(pol, v1)", + "MTEB(rus, v1)", + "MTEB(fas, beta)", + ] + ), + ), + ( + "Miscellaneous", + mteb.get_benchmarks( + [ + "BEIR", + "BEIR-NL", + "NanoBEIR", + "BRIGHT", + "BRIGHT (long)", + "BuiltBench(eng)", + "CoIR", + "FollowIR", + "LongEmbed", + "MINERSBitextMining", + "RAR-b", + ] + ), + ), + ( + "Legacy", + mteb.get_benchmarks( + [ + "MTEB(eng, v1)", + ] + ), + ), +] + + +def _create_button( + i: int, + benchmark: Benchmark, + state: gr.State, + label_to_value: dict[str, str], + **kwargs, +): + val = benchmark.name + label = ( + benchmark.display_name if benchmark.display_name is not None else benchmark.name + ) + label_to_value[label] = benchmark.name + button = gr.Button( + label, + variant="secondary" if i != 0 else "primary", + icon=benchmark.icon, + key=f"{i}_button_{val}", + elem_classes="text-white", + **kwargs, + ) + + def _update_variant(state: str, label: str) -> gr.Button: + if state == label_to_value[label]: + return gr.Button(variant="primary") + else: + return gr.Button(variant="secondary") + + def 
_update_value(label: str) -> str: + return label_to_value[label] + + state.change(_update_variant, inputs=[state, button], outputs=[button]) + button.click(_update_value, inputs=[button], outputs=[state]) + return button + + +def make_selector( + entries: list[list[Benchmark] | tuple[str, list[Benchmark]]], +) -> tuple[gr.State, gr.Column]: + if not entries: + raise ValueError("No entries were specified, can't build selector.") + label_to_value = {} + state = None + with gr.Column() as column: + i = 0 + for entry in entries: + if i == 0: + if isinstance(entry, list): + fist_entry = entry[0] + state = gr.State(fist_entry.name) + elif isinstance(entry, tuple): + _label, _entry = entry + state = gr.State(_entry[0].name) + else: + raise ValueError("Benchmark selector specified incorrectly") + if isinstance(entry, list): + for benchmark in entry: + button = _create_button( + i, benchmark, state, label_to_value, size="lg" + ) + i += 1 + elif isinstance(entry, tuple): + label, _entry = entry + gr.Markdown(f"### **{label}**") + for benchmark in _entry: + button = _create_button( # noqa: F841 + i, benchmark, state, label_to_value, size="md" + ) + i += 1 + + return state, column + + +if __name__ == "__main__": + with gr.Blocks() as b: + selector = make_selector(BENCHMARK_ENTRIES) + + b.launch() diff --git a/mteb/leaderboard/figures.py b/mteb/leaderboard/figures.py index 57a282327c..6a945346f2 100644 --- a/mteb/leaderboard/figures.py +++ b/mteb/leaderboard/figures.py @@ -147,6 +147,7 @@ def performance_size_plot(df: pd.DataFrame) -> go.Figure: "model_text": False, }, hover_name="Model", + color_continuous_scale=px.colors.sequential.Greens, ) # Note: it's important that this comes before setting the size mode fig = add_size_guide(fig) diff --git a/mteb/leaderboard/table.py b/mteb/leaderboard/table.py index b848406ba5..ca61ba02ea 100644 --- a/mteb/leaderboard/table.py +++ b/mteb/leaderboard/table.py @@ -61,6 +61,23 @@ def get_column_types(df: pd.DataFrame) -> list[str]: return 
types +def get_column_widths(df: pd.DataFrame) -> list[str]: + # Please do not remove this function when refactoring. + # Column width calculation seeminlgy changes regularly with Gradio releases, + # and this piece of logic is good enough to quickly fix related issues. + widths = [] + for column_name in df.columns: + column_word_lengths = [len(word) for word in column_name.split()] + if is_numeric_dtype(df[column_name]): + value_lengths = [len(f"{value:.2f}") for value in df[column_name]] + else: + value_lengths = [len(str(value)) for value in df[column_name]] + max_length = max(max(column_word_lengths), max(value_lengths)) + n_pixels = 25 + (max_length * 10) + widths.append(f"{n_pixels}px") + return widths + + def get_means_per_types(per_task: pd.DataFrame): task_names_per_type = defaultdict(list) for task_name in per_task.columns: @@ -237,7 +254,6 @@ def apply_styling( ] light_green_cmap = create_light_green_cmap() numeric_data = joint_table.copy() - numeric_data["Zero-shot"] = numeric_data["Zero-shot"].replace(-1, np.nan) joint_table["Zero-shot"] = joint_table["Zero-shot"].apply(format_zero_shot) joint_table[score_columns] = joint_table[score_columns].map(format_scores) joint_table_style = joint_table.style.format( @@ -278,20 +294,39 @@ def apply_styling( per_task_style = per_task.style.format( "{:.2f}", subset=task_score_columns, na_rep="" ).highlight_max(subset=task_score_columns, props="font-weight: bold") - for col in task_score_columns: - if col != "Model": - mask = per_task[col].notna() - per_task_style = per_task_style.background_gradient( - cmap=light_green_cmap, - subset=pd.IndexSlice[mask, col], - gmap=per_task[col].loc[mask], - ) + # TODO: uncomment this when Gradio fixes it. 
+ # The fix is already merged and contained in this release: https://github.com/gradio-app/gradio/pull/11032 + # It will be available in Gradio 5.25.3 + # for col in task_score_columns: + # if col != "Model": + # mask = per_task[col].notna() + # per_task_style = per_task_style.background_gradient( + # cmap=light_green_cmap, + # subset=pd.IndexSlice[mask, col], + # gmap=per_task[col].loc[mask], + # ) + column_widths = get_column_widths(joint_table_style.data) + column_widths[0] = "100px" + column_widths[1] = "250px" return ( gr.DataFrame( joint_table_style, datatype=column_types, interactive=False, pinned_columns=3, + column_widths=column_widths, + wrap=True, + show_fullscreen_button=True, + show_copy_button=True, + show_search="filter", + ), + gr.DataFrame( + per_task_style, + interactive=False, + pinned_columns=1, + show_fullscreen_button=True, + show_copy_button=True, + show_search="filter", ), gr.DataFrame(per_task_style, interactive=False, pinned_columns=1), ) diff --git a/mteb/load_results/benchmark_results.py b/mteb/load_results/benchmark_results.py index ed6b654b09..4c83d3b156 100644 --- a/mteb/load_results/benchmark_results.py +++ b/mteb/load_results/benchmark_results.py @@ -1,6 +1,7 @@ from __future__ import annotations import json +import logging import warnings from collections import defaultdict from collections.abc import Iterable, Sequence @@ -21,21 +22,71 @@ from mteb.custom_validators import MODALITIES from mteb.languages import ISO_LANGUAGE from mteb.load_results.task_results import TaskResult -from mteb.models.overview import get_model_metas +from mteb.models.overview import ModelMeta, get_model_metas Split = str Score = Any +logger = logging.getLogger(__name__) + + +def _aggregate_and_pivot( + df: pd.DataFrame, + columns: list[str], + aggregation_level: Literal["subset", "split", "task"], + format: Literal["wide", "long"], + aggregation_fn: Callable[[list[Score]], Any] | None, +) -> pd.DataFrame: + if aggregation_level == "subset": + 
index_columns = ["task_name", "split", "subset"] + + elif aggregation_level == "split": + index_columns = ["task_name", "split"] + + elif aggregation_level == "task": + index_columns = ["task_name"] + + # perform aggregation + if aggregation_fn is None: + aggregation_fn = np.mean + + if format == "wide": + return df.pivot_table( + index=index_columns, + columns=columns, + values="score", + aggfunc=aggregation_fn, + ).reset_index() + elif format == "long": + return ( + df.groupby(columns + index_columns) + .agg(score=("score", aggregation_fn)) + .reset_index() + ) + + class ModelResult(BaseModel): + """Data class to hold the results of a model on a set of tasks. + + Attributes: + model_name: Name of the model. + model_revision: Revision of the model. + task_results: List of TaskResult objects. + """ + + # TODO: v2, move to its own file model_result.py + model_name: str model_revision: str | None task_results: list[TaskResult] default_modalities: list[MODALITIES] = Field( default_factory=lambda: ["text"], alias="modalities" ) - model_config = ConfigDict( - protected_namespaces=(), + model_config = ( + ConfigDict( # to free up the name model_* which is otherwise protected + protected_namespaces=(), + ) ) def __repr__(self) -> str: @@ -57,6 +108,7 @@ def filter_tasks( task_types: list[TASK_TYPE] | None = None, modalities: list[MODALITIES] | None = None, ) -> ModelResult: + # TODO: v2 see filter_tasks in BenchmarkResults - but can be moved to a private function or removed new_task_results = [] for task_result in self.task_results: if (task_names is not None) and (task_result.task_name not in task_names): @@ -104,6 +156,7 @@ def get_scores( aggregation: Callable[[list[Score]], Any] | None = None, format: Literal["wide", "long"] = "wide", ) -> dict | list: + # TODO: Convert to private function in v2 - potentially remove if (getter is not None) or (aggregation is not None) or (scripts is not None): use_fast = False getter = ( @@ -140,14 +193,15 @@ def get_scores( try: if 
use_fast: score = task_res.get_score_fast( - splits=splits, languages=languages + splits=splits, + languages=languages, # type: ignore ) else: score = task_res.get_score( splits=splits, languages=languages, - aggregation=aggregation, - getter=getter, + aggregation=aggregation, # type: ignore + getter=getter, # type: ignore scripts=scripts, ) entry = dict( # noqa @@ -167,7 +221,88 @@ def get_scores( ) return entries - def __iter__(self): + def _get_score_for_table(self) -> list[dict[str, str | float]]: + scores_data = [] + model_name = self.model_name + for task_result in self.task_results: + task_name = task_result.task_name + for split, scores_list in task_result.scores.items(): + for score_item in scores_list: + row = { + "model_name": model_name, + "model_revision": self.model_revision, + "task_name": task_name, + "split": split, + "subset": score_item.get("hf_subset", "default"), + "score": score_item.get("main_score", None), + } + + scores_data.append(row) + + return scores_data + + def to_dataframe( + self, + aggregation_level: Literal["subset", "split", "task"] = "task", + aggregation_fn: Callable[[list[Score]], Any] | None = None, + include_model_revision: bool = False, + format: Literal["wide", "long"] = "wide", + ) -> pd.DataFrame: + """Get a DataFrame with the scores for all models and tasks. + The DataFrame will have the following columns in addition to the metadata columns: + + - model_name: The name of the model. + - task_name: The name of the task. + - score: The main score of the model on the task. + + In addition, the DataFrame can have the following columns depending on the aggregation level: + + - split: The split of the task. E.g. "test", "train", "validation". + - subset: The subset of the task. E.g. "en", "fr-en". + + Afterwards, the DataFrame will be aggregated according to the aggregation method and pivoted to either a wide format. + + Args: + aggregation_level: The aggregation to use. 
Can be one of: + - "subset"/None: No aggregation will be done. The DataFrame will have one row per model, task, split and subset. + - "split": Aggregates the scores by split. The DataFrame will have one row per model, task and split. + - "task": Aggregates the scores by task. The DataFrame will have one row per model and task. + aggregation_fn: The function to use for aggregation. If None, the mean will be used. + include_model_revision: If True, the model revision will be included in the DataFrame. If False, it will be excluded. + format: The format of the DataFrame. Can be one of: + - "wide": The DataFrame will be of shape (number of tasks, number of models). Scores will be in the cells. + - "long": The DataFrame will be of length (number of tasks * number of models). Scores will be in columns. + + Returns: + A DataFrame with the scores for all models and tasks. + """ + scores_data = self._get_score_for_table() + + if not scores_data: + logger.warning("No scores data available. Returning empty DataFrame.") + return pd.DataFrame() + + # Create DataFrame + df = pd.DataFrame(scores_data) + + _columns = ["model_name"] + if include_model_revision is False: + df = df.drop(columns=["model_revision"]) + else: + _columns.append("model_revision") + + return _aggregate_and_pivot( + df, + columns=_columns, + aggregation_level=aggregation_level, + format=format, + aggregation_fn=aggregation_fn, + ) + + def __hash__(self) -> int: + return id(self) + + def __iter__(self) -> Iterable[TaskResult]: return iter(self.task_results) def __getitem__(self, index) -> TaskResult: @@ -175,6 +310,11 @@ def __getitem__(self, index) -> TaskResult: @property def languages(self) -> list[str]: + """Get all languages in the model results. + + Returns: + A list of languages in the model results.
+ """ langs = [] for task_res in self.task_results: langs.extend(task_res.languages) @@ -182,6 +322,12 @@ def languages(self) -> list[str]: @property def domains(self) -> list[str]: + """Get all domains in the model results. + + Returns: + A list of domains in the model results. + + """ ds = [] for task_res in self.task_results: ds.extend(task_res.domains) @@ -189,14 +335,29 @@ def domains(self) -> list[str]: @property def task_types(self) -> list[str]: + """Get all task types in the model results. + + Returns: + A list of task types in the model results. + """ return list({task_res.task_type for task_res in self.task_results}) @property def task_names(self) -> list[str]: + """Get all task names in the model results. + + Returns: + A list of task names in the model results. + """ return [task_res.task_name for task_res in self.task_results] @property def modalities(self) -> list[str]: + """Get all modalities in the task results. + + Returns: + A list of modalities in the task results. + """ mods = [] for task_res in self.task_results: task_modalities = getattr(task_res, "modalities", []) @@ -207,9 +368,17 @@ def modalities(self) -> list[str]: class BenchmarkResults(BaseModel): + """Data class to hold the benchmark results of a model. + + Attributes: + model_results: List of ModelResult objects. 
+ """ + model_results: list[ModelResult] - model_config = ConfigDict( - protected_namespaces=(), + model_config = ( + ConfigDict( # to free up the name model_results which is otherwise protected + protected_namespaces=(), + ) ) def __repr__(self) -> str: @@ -224,9 +393,10 @@ def filter_tasks( task_names: list[str] | None = None, languages: list[str] | None = None, domains: list[TASK_DOMAIN] | None = None, - task_types: list[TASK_TYPE] | None = None, + task_types: list[TASK_TYPE] | None = None, # type: ignore modalities: list[MODALITIES] | None = None, ) -> BenchmarkResults: + # TODO: Same as filter_models model_results = [ res.filter_tasks( task_names=task_names, @@ -242,11 +412,52 @@ def filter_tasks( ) def select_tasks(self, tasks: Sequence[AbsTask]) -> BenchmarkResults: + """Select tasks from the benchmark results. + + Args: + tasks: List of tasks to select. Can be a list of AbsTask objects or task names. + """ new_model_results = [ model_res.select_tasks(tasks) for model_res in self.model_results ] return type(self).model_construct(model_results=new_model_results) + def select_models( + self, + names: list[str] | list[ModelMeta], + revisions: list[str | None] | None = None, + ) -> BenchmarkResults: + """Get models by name and revision. + + Args: + names: List of model names to filter by. Can also be a list of ModelMeta objects. In which case, the revision is ignored. + revisions: List of model revisions to filter by. If None, all revisions are returned. + """ + models_res = [] + _revisions = revisions if revisions is not None else [None] * len(names) + + name_rev = {} + + if len(names) != len(_revisions): + raise ValueError( + "The length of names and revisions must be the same or revisions must be None." 
+ ) + + for name, revision in zip(names, _revisions): + if isinstance(name, ModelMeta): + name_rev[name.name] = name.revision + else: + name_rev[name] = revision + + for model_res in self.model_results: + model_name = model_res.model_name + revision = model_res.model_revision + if model_name in name_rev: + if name_rev[model_name] is None or revision == name_rev[model_name]: + models_res.append(model_res) + + return type(self).model_construct(model_results=models_res) + def filter_models( self, model_names: Iterable[str] | None = None, @@ -257,8 +468,10 @@ def filter_models( use_instructions: bool | None = None, zero_shot_on: list[AbsTask] | None = None, ) -> BenchmarkResults: - # if model_names is None: - # model_names = [model_res.model_name for model_res in self] + # TODO: This seems like mostly a utility function for the benchmark + # I would probably move the filtering of the models outside of this call. No need to call get_model_metas inside the filter. + # interface would then be the same as the get_models function + model_metas = get_model_metas( model_names=model_names, languages=languages, @@ -274,9 +487,19 @@ def filter_models( for model_res in self: if model_res.model_name in models: new_model_results.append(model_res) + return type(self).model_construct(model_results=new_model_results) - def join_revisions(self): + def join_revisions(self) -> BenchmarkResults: + """Join revisions of the same model. + + In case of conflicts, the following rules are applied: + - If the main revision is present, it is kept. The main revision is the one defined in the model's ModelMeta object. + - If there are multiple revisions and some of them are None or na, they are filtered out. + - If there is no main revision, we prefer the one run using the latest mteb version. + """ + # TODO: In v2 we should probably have this be the default when loading.
We could probably even reduce loading times by only loading what we need + def parse_version(version_str: str) -> Version | None: try: return Version(version_str) @@ -312,7 +535,7 @@ def keep_best(group: pd.DataFrame) -> pd.DataFrame: records = [] for model_result in self: - for task_result in model_result: + for task_result in model_result.task_results: records.append( dict( model=model_result.model_name, @@ -329,8 +552,8 @@ def keep_best(group: pd.DataFrame) -> pd.DataFrame: model_to_main_revision = { meta.name: meta.revision for meta in get_model_metas() } - task_df["main_revision"] = task_df["model"].map(model_to_main_revision) - task_df["mteb_version"] = task_df["mteb_version"].map(parse_version) + task_df["main_revision"] = task_df["model"].map(model_to_main_revision) # type: ignore + task_df["mteb_version"] = task_df["mteb_version"].map(parse_version) # type: ignore task_df = ( task_df.groupby(["model", "task_name"]) .apply(keep_best) @@ -355,6 +578,7 @@ def get_scores( aggregation: Callable[[list[Score]], Any] | None = None, format: Literal["wide", "long"] = "wide", ) -> list[dict]: + # TODO: Convert to private function in v2 entries = [] if format == "wide": for model_res in self: @@ -371,7 +595,7 @@ def get_scores( { "model": model_res.model_name, "revision": model_res.model_revision, - **model_scores, + **model_scores, # type: ignore } ) except Exception as e: @@ -397,6 +621,72 @@ def get_scores( ) return entries + def to_dataframe( + self, + aggregation_level: Literal["subset", "split", "task"] = "task", + aggregation_fn: Callable[[list[Score]], Any] | None = None, + include_model_revision: bool = False, + format: Literal["wide", "long"] = "wide", + ) -> pd.DataFrame: + """Get a DataFrame with the scores for all models and tasks. + The DataFrame will have the following columns in addition to the metadata columns: + + - model_name: The name of the model. + - task_name: The name of the task. + - score: The main score of the model on the task. 
+ + In addition, the DataFrame can have the following columns depending on the aggregation level: + + - split: The split of the task. E.g. "test", "train", "validation". + - subset: The subset of the task. E.g. "en", "fr-en". + + Afterwards, the DataFrame will be aggregated according to the aggregation method and pivoted to either a wide or a long format. + + Args: + aggregation_level: The aggregation to use. Can be one of: + - "subset"/None: No aggregation will be done. The DataFrame will have one row per model, task, split and subset. + - "split": Aggregates the scores by split. The DataFrame will have one row per model, task and split. + - "task": Aggregates the scores by task. The DataFrame will have one row per model and task. + aggregation_fn: The function to use for aggregation. If None, the mean will be used. + include_model_revision: If True, the model revision will be included in the DataFrame. If False, it will be excluded. + If there are multiple revisions for the same model, they will be joined using the `join_revisions` method. + format: The format of the DataFrame. Can be one of: + - "wide": The DataFrame will be of shape (number of tasks, number of models). Scores will be in the cells. + - "long": The DataFrame will be of length (number of tasks * number of models). Scores will be in columns. + + Returns: + A DataFrame with the scores for all models and tasks. + """ + bench_results = self + if include_model_revision is False: + bench_results = bench_results.join_revisions() + + scores_data = [] + for model_result in bench_results: + scores_data.extend(model_result._get_score_for_table()) + + if not scores_data: + logger.warning("No scores data available.
Returning empty DataFrame.") + return pd.DataFrame() + + # Create DataFrame + df = pd.DataFrame(scores_data) + + _columns = ["model_name"] + if include_model_revision is False: + df = df.drop(columns=["model_revision"]) + else: + _columns.append("model_revision") + + # Aggregation + return _aggregate_and_pivot( + df, + columns=_columns, + aggregation_level=aggregation_level, + aggregation_fn=aggregation_fn, + format=format, + ) + def __iter__(self): return iter(self.model_results) @@ -404,6 +694,7 @@ def __getitem__(self, index) -> ModelResult: return self.model_results[index] def to_legacy_dict(self) -> dict[str, dict[str, list[TaskResult]]]: + # TODO: Make private or remove in v2 res = defaultdict(dict) for model_res in self: res[model_res.model_name][model_res.model_revision] = model_res.task_results @@ -411,6 +702,7 @@ def to_legacy_dict(self) -> dict[str, dict[str, list[TaskResult]]]: @classmethod def from_legacy_dict(cls, legacy: dict[str, dict[str, list[TaskResult]]]): + # TODO: Make private or remove in v2 model_results = [] for model_name, revisions in legacy.items(): for model_revision, results in revisions.items(): @@ -451,6 +743,11 @@ def from_disk(cls, path: Path | str) -> BenchmarkResults: @property def languages(self) -> list[str]: + """Get all languages in the benchmark results. + + Returns: + A list of languages in ISO 639-1 format. + """ langs = [] for model_res in self.model_results: langs.extend(model_res.languages) @@ -458,6 +755,11 @@ def languages(self) -> list[str]: @property def domains(self) -> list[str]: + """Get all domains in the benchmark results. + + Returns: + A list of domains. + """ ds = [] for model_res in self.model_results: ds.extend(model_res.domains) @@ -465,6 +767,12 @@ def domains(self) -> list[str]: @property def task_types(self) -> list[str]: + """Get all task types in the benchmark results. + + Returns: + A list of task types.
+ """ + # TODO: V2: Task types vs task categories - we should probably be consistent ts = [] for model_res in self.model_results: ts.extend(model_res.task_types) @@ -472,6 +780,11 @@ def task_types(self) -> list[str]: @property def task_names(self) -> list[str]: + """Get all task names in the benchmark results. + + Returns: + A list of task names. + """ names = [] for model_res in self.model_results: names.extend(model_res.task_names) @@ -479,7 +792,33 @@ def task_names(self) -> list[str]: @property def modalities(self) -> list[str]: + """Get all modalities in the benchmark results. + + Returns: + A list of modalities. + """ mod = [] for model_res in self.model_results: mod.extend(model_res.modalities) return list(set(mod)) + + @property + def model_names(self) -> list[str]: + """Get all model names in the benchmark results. + + Returns: + A list of model names. + """ + return [model_res.model_name for model_res in self.model_results] + + @property + def model_revisions(self) -> list[dict[str, str | None]]: + """Get all model revisions in the benchmark results. + + Returns: + A list of dictionaries with model names and revisions. + """ + return [ + {"model_name": model_res.model_name, "revision": model_res.model_revision} + for model_res in self.model_results + ] diff --git a/mteb/load_results/load_results.py b/mteb/load_results/load_results.py index 917f82553f..ed8a90c060 100644 --- a/mteb/load_results/load_results.py +++ b/mteb/load_results/load_results.py @@ -106,6 +106,8 @@ def load_results( splits from the results object that are not default in the task metadata. Defaults to True. only_main_score: If True, only the main score will be loaded. 
""" + # TODO: we want to allow results_repo (the first argument) to be a local path + # TODO: in v2 we can rename it to "path" to align with load_dataset repo_directory = download_of_results(results_repo, download_latest=download_latest) model_paths = [p for p in (repo_directory / "results").glob("*") if p.is_dir()] diff --git a/mteb/load_results/task_results.py b/mteb/load_results/task_results.py index 989d191d1e..c8218075c9 100644 --- a/mteb/load_results/task_results.py +++ b/mteb/load_results/task_results.py @@ -464,6 +464,7 @@ def get_score_fast( subsets: Iterable[str] | None = None, ) -> float: """Sped up version of get_score that will be used if no aggregation, script or getter needs to be specified.""" + # TODO: v2: We should make this private if splits is None: splits = self.scores.keys() val_sum = 0 diff --git a/mteb/model_meta.py b/mteb/model_meta.py index 74f4a79f64..2aeb1ac6d6 100644 --- a/mteb/model_meta.py +++ b/mteb/model_meta.py @@ -17,7 +17,10 @@ from mteb.encoder_interface import Encoder from .custom_validators import LICENSES, MODALITIES, STR_DATE, STR_URL -from .languages import ISO_LANGUAGE_SCRIPT +from .languages import ( + ISO_LANGUAGE_SCRIPT, + check_language_code, +) if TYPE_CHECKING: from .models.sentence_transformer_wrapper import SentenceTransformerWrapper @@ -123,6 +126,16 @@ def to_dict(self): dict_repr["loader"] = get_loader_name(loader) return dict_repr + @field_validator("languages") + @classmethod + def languages_are_valid(cls, languages: list[ISO_LANGUAGE_SCRIPT] | None) -> None: + if languages is None: + return None + + for code in languages: + check_language_code(code) + return languages + @field_validator("name") @classmethod def check_name(cls, v: str | None) -> str | None: diff --git a/mteb/models/align_models.py b/mteb/models/align_models.py index be8ff8e56d..d43820d539 100644 --- a/mteb/models/align_models.py +++ b/mteb/models/align_models.py @@ -139,7 +139,7 @@ def get_fused_embeddings( model_name="kakaobrain/align-base", 
), name="kakaobrain/align-base", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="e96a37facc7b1f59090ece82293226b817afd6ba", release_date="2023-02-24", modalities=["image", "text"], diff --git a/mteb/models/arctic_models.py b/mteb/models/arctic_models.py index 8397157f43..ca0d0fbb34 100644 --- a/mteb/models/arctic_models.py +++ b/mteb/models/arctic_models.py @@ -5,80 +5,80 @@ from mteb.model_meta import ModelMeta, sentence_transformers_loader LANGUAGES_V2_0 = [ - "afr_Latn", - "ara_Arab", - "aze_Latn", - "bel_Cyrl", - "bul_Cyrl", - "ben_Beng", - "cat_Latn", - "ceb_Latn", - "ces_Latn", - "cym_Latn", - "dan_Latn", - "deu_Latn", - "ell_Grek", - "eng_Latn", - "spa_Latn", - "est_Latn", - "eus_Latn", - "fas_Arab", - "fin_Latn", - "fra_Latn", - "glg_Latn", - "guj_Gujr", - "heb_Hebr", - "hin_Deva", - "hrv_Latn", - "hat_Latn", - "hun_Latn", - "hye_Armn", - "ind_Latn", - "isl_Latn", - "ita_Latn", - "jpn_Jpan", - "jav_Latn", - "kat_Geor", - "kaz_Cyrl", - "khm_Khmr", - "kan_Knda", - "kor_Hang", - "kir_Cyrl", - "lao_Laoo", - "lit_Latn", - "lav_Latn", - "mkd_Cyrl", - "mal_Mlym", - "mon_Cyrl", - "mar_Deva", - "msa_Latn", - "mya_Mymr", - "nep_Deva", - "nld_Latn", - "pan_Guru", - "pol_Latn", - "por_Latn", - "que_Latn", - "ron_Latn", - "rus_Cyrl", - "sin_Sinh", - "slk_Latn", - "slv_Latn", - "som_Latn", - "sqi_Latn", - "srp_Cyrl", - "swe_Latn", - "swa_Latn", - "tam_Taml", - "tel_Telu", - "tha_Thai", - "tgl_Latn", - "tur_Latn", - "ukr_Cyrl", - "urd_Arab", - "vie_Latn", - "yor_Latn", - "zho_Hans", + "afr-Latn", + "ara-Arab", + "aze-Latn", + "bel-Cyrl", + "bul-Cyrl", + "ben-Beng", + "cat-Latn", + "ceb-Latn", + "ces-Latn", + "cym-Latn", + "dan-Latn", + "deu-Latn", + "ell-Grek", + "eng-Latn", + "spa-Latn", + "est-Latn", + "eus-Latn", + "fas-Arab", + "fin-Latn", + "fra-Latn", + "glg-Latn", + "guj-Gujr", + "heb-Hebr", + "hin-Deva", + "hrv-Latn", + "hat-Latn", + "hun-Latn", + "hye-Armn", + "ind-Latn", + "isl-Latn", + "ita-Latn", + "jpn-Jpan", + "jav-Latn", + "kat-Geor", + 
"kaz-Cyrl", + "khm-Khmr", + "kan-Knda", + "kor-Hang", + "kir-Cyrl", + "lao-Laoo", + "lit-Latn", + "lav-Latn", + "mkd-Cyrl", + "mal-Mlym", + "mon-Cyrl", + "mar-Deva", + "msa-Latn", + "mya-Mymr", + "nep-Deva", + "nld-Latn", + "pan-Guru", + "pol-Latn", + "por-Latn", + "que-Latn", + "ron-Latn", + "rus-Cyrl", + "sin-Sinh", + "slk-Latn", + "slv-Latn", + "som-Latn", + "sqi-Latn", + "srp-Cyrl", + "swe-Latn", + "swa-Latn", + "tam-Taml", + "tel-Telu", + "tha-Thai", + "tgl-Latn", + "tur-Latn", + "ukr-Cyrl", + "urd-Arab", + "vie-Latn", + "yor-Latn", + "zho-Hans", ] arctic_v1_training_datasets = { @@ -126,7 +126,7 @@ name="Snowflake/snowflake-arctic-embed-xs", revision="742da4f66e1823b5b4dbe6c320a1375a1fd85f9e", release_date="2024-07-08", # initial commit of hf model. - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, framework=["Sentence Transformers", "PyTorch"], n_parameters=22_600_000, @@ -154,7 +154,7 @@ name="Snowflake/snowflake-arctic-embed-s", revision="d3c1d2d433dd0fdc8e9ca01331a5f225639e798f", release_date="2024-04-12", # initial commit of hf model. - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, framework=["Sentence Transformers", "PyTorch"], n_parameters=32_200_000, @@ -182,7 +182,7 @@ name="Snowflake/snowflake-arctic-embed-m", revision="cc17beacbac32366782584c8752220405a0f3f40", release_date="2024-04-12", # initial commit of hf model. - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, framework=["Sentence Transformers", "PyTorch"], n_parameters=109_000_000, @@ -210,7 +210,7 @@ name="Snowflake/snowflake-arctic-embed-m-long", revision="89d0f6ab196eead40b90cb6f9fefec01a908d2d1", release_date="2024-04-12", # initial commit of hf model. 
- languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, framework=["Sentence Transformers", "PyTorch"], n_parameters=137_000_000, @@ -237,7 +237,7 @@ name="Snowflake/snowflake-arctic-embed-l", revision="9a9e5834d2e89cdd8bb72b64111dde496e4fe78c", release_date="2024-04-12", # initial commit of hf model. - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, framework=["Sentence Transformers", "PyTorch"], n_parameters=335_000_000, @@ -267,7 +267,7 @@ name="Snowflake/snowflake-arctic-embed-m-v1.5", revision="97eab2e17fcb7ccb8bb94d6e547898fa1a6a0f47", release_date="2024-07-08", # initial commit of hf model. - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, framework=["Sentence Transformers", "PyTorch"], n_parameters=109_000_000, diff --git a/mteb/models/bge_models.py b/mteb/models/bge_models.py index ed360fbbdf..9ccc291553 100644 --- a/mteb/models/bge_models.py +++ b/mteb/models/bge_models.py @@ -137,64 +137,64 @@ # https://huggingface.co/BAAI/bge-m3/discussions/29 bgem3_languages = [ - "afr_Latn", # af + "afr-Latn", # af # als - "amh_Ethi", # am + "amh-Ethi", # am # an # ar - "azj_Latn", # arz + "azj-Latn", # arz # as - "ast_Latn", # ast + "ast-Latn", # ast # av # az - "azj_Latn", # azb + "azj-Latn", # azb # ba # bar # bcl - "ben_Beng", # be - "bul_Cyrl", # bg + "ben-Beng", # be + "bul-Cyrl", # bg # bh # bn # bo - "bel_Cyrl", # bpy + "bel-Cyrl", # bpy # br # bs # bxr - "cat_Latn", # ca + "cat-Latn", # ca # cbk # ce - "ceb_Latn", # ceb - "ckb_Arab", # ckb + "ceb-Latn", # ceb + "ckb-Arab", # ckb # co # cs # cv # cy - "dan_Latn", # da - "deu_Latn", # de + "dan-Latn", # da + "deu-Latn", # de # diq # dsb # dty # dv - "ell_Grek", # el + "ell-Grek", # el # eml - "eng_Latn", # en + "eng-Latn", # en # eo - "est_Latn", # es + "est-Latn", # es # et # eu # fa - "fin_Latn", # fi - "fra_Latn", # fr + "fin-Latn", # fi + "fra-Latn", # fr # fy # ga # gd - "glg_Latn", # gl + "glg-Latn", # gl # gn # gom - "guj_Gujr", # gu + "guj-Gujr", 
# gu # gv - "heb_Hebr", # he - "hin_Deva", # hi + "heb-Hebr", # he + "hin-Deva", # hi # hif # hr # hsb @@ -207,15 +207,15 @@ # ilo # io # is - "ita_Latn", # it - "jpn_Jpan", # ja + "ita-Latn", # it + "jpn-Jpan", # ja # jbo # jv # ka # kk # km # kn - "kor_Hang", # ko + "kor-Hang", # ko # krc # ku # kv @@ -266,7 +266,7 @@ # qu # rm # ro - "rus_Cyrl", # ru + "rus-Cyrl", # ru # sa # sah # sc @@ -286,14 +286,14 @@ # ta # te # tg - "tha_Thai", # th + "tha-Thai", # th # tk # tl # tr # tt # tyv # ug - "ukr_Cyrl", # uk + "ukr-Cyrl", # uk # ur # uz # vec @@ -309,7 +309,7 @@ # yi # yo # yue - "zho_Hans", # zh + "zho-Hans", # zh ] bge_small_en_v1_5 = ModelMeta( @@ -320,7 +320,7 @@ model_prompts=model_prompts, ), name="BAAI/bge-small-en-v1.5", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="5c38ec7c405ec4b44b94cc5a9bb96e735b38267a", release_date="2023-09-12", # initial commit of hf model. @@ -346,7 +346,7 @@ model_prompts=model_prompts, ), name="BAAI/bge-base-en-v1.5", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="a5beb1e3e68b9ab74eb54cfd186867f64f240e1a", release_date="2023-09-11", # initial commit of hf model. @@ -372,7 +372,7 @@ model_prompts=model_prompts, ), name="BAAI/bge-large-en-v1.5", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="d4aa6901d3a41ba39fb536a557fa166f842b0e09", release_date="2023-09-12", # initial commit of hf model. @@ -398,7 +398,7 @@ model_prompts=model_prompts_zh, ), name="BAAI/bge-small-zh", - languages=["zho_Hans"], + languages=["zho-Hans"], open_weights=True, revision="1d2363c5de6ce9ba9c890c8e23a4c72dce540ca8", release_date="2023-08-05", # initial commit of hf model. @@ -425,7 +425,7 @@ model_prompts=model_prompts_zh, ), name="BAAI/bge-base-zh", - languages=["zho_Hans"], + languages=["zho-Hans"], open_weights=True, revision="0e5f83d4895db7955e4cb9ed37ab73f7ded339b6", release_date="2023-08-05", # initial commit of hf model. 
@@ -452,7 +452,7 @@ model_prompts=model_prompts_zh, ), name="BAAI/bge-large-zh", - languages=["zho_Hans"], + languages=["zho-Hans"], open_weights=True, revision="b5d9f5c027e87b6f0b6fa4b614f8f9cdc45ce0e8", release_date="2023-08-02", # initial commit of hf model. @@ -479,7 +479,7 @@ model_prompts=model_prompts, ), name="BAAI/bge-small-en", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="4778d71a06863076696b03fd2777eb118712cad8", release_date="2023-08-05", # initial commit of hf model. @@ -506,7 +506,7 @@ model_prompts=model_prompts, ), name="BAAI/bge-base-en", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="b737bf5dcc6ee8bdc530531266b4804a5d77b5d8", release_date="2023-08-05", # initial commit of hf model. @@ -533,7 +533,7 @@ model_prompts=model_prompts, ), name="BAAI/bge-large-en", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="abe7d9d814b775ca171121fb03f394dc42974275", release_date="2023-08-05", # initial commit of hf model. @@ -561,7 +561,7 @@ model_prompts=model_prompts_zh, ), name="BAAI/bge-small-zh-v1.5", - languages=["zho_Hans"], + languages=["zho-Hans"], open_weights=True, revision="7999e1d3359715c523056ef9478215996d62a620", release_date="2023-09-12", # initial commit of hf model. @@ -587,7 +587,7 @@ model_prompts=model_prompts_zh, ), name="BAAI/bge-base-zh-v1.5", - languages=["zho_Hans"], + languages=["zho-Hans"], open_weights=True, revision="f03589ceff5aac7111bd60cfc7d497ca17ecac65", release_date="2023-09-11", # initial commit of hf model. @@ -613,7 +613,7 @@ model_prompts=model_prompts_zh, ), name="BAAI/bge-large-zh-v1.5", - languages=["zho_Hans"], + languages=["zho-Hans"], open_weights=True, revision="79e7739b6ab944e86d6171e44d24c997fc1e0116", release_date="2023-09-12", # initial commit of hf model. 
@@ -721,13 +721,13 @@ ), name="BAAI/bge-multilingual-gemma2", languages=[ - "eng_Latn", - "zho_Hans", - "kor_Hang", - "kor_Latn", - "fra_Latn", - "jpn_Jpan", - "jpn_Latn", + "eng-Latn", + "zho-Hans", + "kor-Hang", + "kor-Latn", + "fra-Latn", + "jpn-Jpan", + "jpn-Latn", ], # This list is incomlete. Their description says "and more". # I'm also unsure about the scripts. open_weights=True, @@ -760,7 +760,7 @@ ), name="BAAI/bge-en-icl", languages=[ - "eng_Latn", + "eng-Latn", ], open_weights=True, revision="971c7e1445cc86656ca0bd85ed770b8675a40bb5", diff --git a/mteb/models/blip2_models.py b/mteb/models/blip2_models.py index 0314ffcd22..899f3fc54e 100644 --- a/mteb/models/blip2_models.py +++ b/mteb/models/blip2_models.py @@ -227,7 +227,7 @@ def get_fused_embeddings( model_name="Salesforce/blip2-opt-2.7b", ), name="Salesforce/blip2-opt-2.7b", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="51572668da0eb669e01a189dc22abe6088589a24", release_date="2024-03-22", modalities=["image", "text"], @@ -252,7 +252,7 @@ def get_fused_embeddings( model_name="Salesforce/blip2-opt-6.7b-coco", ), name="Salesforce/blip2-opt-6.7b-coco", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="0d580de59320a25a4d2c386387bcef310d5f286e", release_date="2024-03-31", modalities=["image", "text"], diff --git a/mteb/models/blip_models.py b/mteb/models/blip_models.py index 9fadf15e1a..bad1a85948 100644 --- a/mteb/models/blip_models.py +++ b/mteb/models/blip_models.py @@ -160,7 +160,7 @@ def get_fused_embeddings( model_name="Salesforce/blip-image-captioning-large", ), name="Salesforce/blip-image-captioning-large", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="2227ac38c9f16105cb0412e7cab4759978a8fd90", release_date="2023-12-07", modalities=["image", "text"], @@ -189,7 +189,7 @@ def get_fused_embeddings( model_name="Salesforce/blip-image-captioning-base", ), name="Salesforce/blip-image-captioning-base", - languages=["eng_Latn"], + languages=["eng-Latn"], 
revision="89b09ea1789f7addf2f6d6f0dfc4ce10ab58ef84", release_date="2023-08-01", modalities=["image", "text"], @@ -219,7 +219,7 @@ def get_fused_embeddings( model_name="Salesforce/blip-vqa-base", ), name="Salesforce/blip-vqa-base", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="c7df8e7cd7aa2ee9af18f56e2b29e59a92651b64", release_date="2023-12-07", modalities=["image", "text"], @@ -247,7 +247,7 @@ def get_fused_embeddings( model_name="Salesforce/blip-vqa-capfilt-large", ), name="Salesforce/blip-vqa-capfilt-large", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="e53f95265aeab69013fabb5380500ab984adbbb4", release_date="2023-01-22", modalities=["image", "text"], @@ -275,7 +275,7 @@ def get_fused_embeddings( model_name="Salesforce/blip-itm-base-coco", ), name="Salesforce/blip-itm-base-coco", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="7eaa90c11850c0b17fc38c6a11e7d88bd6ac231f", release_date="2023-08-01", modalities=["image", "text"], @@ -303,7 +303,7 @@ def get_fused_embeddings( model_name="Salesforce/blip-itm-large-coco", ), name="Salesforce/blip-itm-large-coco", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="fef05cafc05298067cbbca00b125749394a77a6f", release_date="2023-08-01", modalities=["image", "text"], @@ -332,7 +332,7 @@ def get_fused_embeddings( model_name="Salesforce/blip-itm-base-flickr", ), name="Salesforce/blip-itm-base-flickr", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="1de29e660d91ae1786c1876212ea805a22eab251", release_date="2023-08-01", modalities=["image", "text"], @@ -361,7 +361,7 @@ def get_fused_embeddings( model_name="Salesforce/blip-itm-large-flickr", ), name="Salesforce/blip-itm-large-flickr", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="bda12e6506758f54261b5ab174b2c55a3ba143fb", release_date="2023-08-01", modalities=["image", "text"], diff --git a/mteb/models/bm25.py b/mteb/models/bm25.py index 3de9be16e8..6c9db2b808 100644 --- a/mteb/models/bm25.py +++ 
b/mteb/models/bm25.py @@ -123,7 +123,7 @@ def encode(self, texts: list[str], **kwargs): bm25_s = ModelMeta( loader=partial(bm25_loader, model_name="bm25s"), # type: ignore name="bm25s", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="0_1_10", release_date="2024-07-10", ## release of version 0.1.10 diff --git a/mteb/models/cache_wrapper.py b/mteb/models/cache_wrapper.py index 4fde7c4f49..57b8954c7b 100644 --- a/mteb/models/cache_wrapper.py +++ b/mteb/models/cache_wrapper.py @@ -206,24 +206,30 @@ def __init__(self, model: Encoder, cache_path: str | Path): self.cache_path.mkdir(parents=True, exist_ok=True) if hasattr(model, "encode"): - self.cache = TextVectorMap(self.cache_path / "cache") - self.cache.load(name="cache") + self.cache_dict = {} else: logger.error("Model must have an 'encode' method.") raise ValueError("Invalid model encoding method") logger.info("Initialized CachedEmbeddingWrapper") - def encode(self, texts: list[str], batch_size: int = 32, **kwargs) -> np.ndarray: + def encode( + self, texts: list[str], batch_size: int = 32, task_name: str = None, **kwargs + ) -> np.ndarray: """Encode texts using the wrapped model, with caching""" try: results = [] uncached_texts = [] uncached_indices = [] + # Initialize cache + if task_name not in self.cache_dict: + self.cache_dict[task_name] = TextVectorMap(self.cache_path / task_name) + self.cache_dict[task_name].load(name=task_name) + # Check cache for each text for i, text in enumerate(texts): - vector = self.cache.get_vector(text) + vector = self.cache_dict[task_name].get_vector(text) if vector is not None: results.append(vector) else: @@ -241,9 +247,9 @@ def encode(self, texts: list[str], batch_size: int = 32, **kwargs) -> np.ndarray # Add new vectors to cache for text, vector in zip(uncached_texts, new_vectors): - self.cache.add(text, vector) + self.cache_dict[task_name].add(text, vector) results.extend(new_vectors) - self.cache.save() + self.cache_dict[task_name].save() else: 
logger.info("All texts found in cache") @@ -287,5 +293,6 @@ def __del__(self): self.close() def close(self): - self.cache.close() + for task in list(self.cache_dict.keys()): + self.cache_dict[task].close() logger.info("Closed CachedEmbeddingWrapper") diff --git a/mteb/models/cde_models.py b/mteb/models/cde_models.py index c8e398c1a3..9845ce71b1 100644 --- a/mteb/models/cde_models.py +++ b/mteb/models/cde_models.py @@ -12,7 +12,7 @@ cde_small_v1 = ModelMeta( loader=no_model_implementation_available, name="jxm/cde-small-v1", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="8d5736163718a8b65cd787b75ed61020d18bad3c", release_date="2024-09-24", @@ -35,7 +35,7 @@ cde_small_v2 = ModelMeta( loader=no_model_implementation_available, name="jxm/cde-small-v2", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="a7e5882ad52c27ea2831fc8258f24379c25cb459", release_date="2025-01-13", diff --git a/mteb/models/clap_models.py b/mteb/models/clap_models.py index 628aadf342..f4d7900c90 100644 --- a/mteb/models/clap_models.py +++ b/mteb/models/clap_models.py @@ -194,7 +194,7 @@ def encode( clap_htsat_fused = ModelMeta( loader=partial(ClapZeroShotWrapper, model_name="laion/clap-htsat-fused"), name="laion/clap-htsat-fused", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="cca9e288ab447cee67d9ada1f85ddb46500f1401", release_date="2023-05-22", modalities=["audio", "text"], @@ -219,7 +219,7 @@ def encode( clap_htsat_unfused = ModelMeta( loader=partial(ClapZeroShotWrapper, model_name="laion/clap-htsat-unfused"), name="laion/clap-htsat-unfused", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="8fa0f1c6d0433df6e97c127f64b2a1d6c0dcda8a", release_date="2023-05-22", modalities=["audio", "text"], @@ -243,7 +243,7 @@ def encode( larger_clap_general = ModelMeta( loader=partial(ClapZeroShotWrapper, model_name="laion/larger_clap_general"), name="laion/larger_clap_general", - languages=["eng_Latn"], + 
languages=["eng-Latn"], revision="ada0c23a36c4e8582805bb38fec3905903f18b41", release_date="2023-05-22", modalities=["audio", "text"], @@ -267,7 +267,7 @@ def encode( larger_clap_music = ModelMeta( loader=partial(ClapZeroShotWrapper, model_name="laion/larger_clap_music"), name="laion/larger_clap_music", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="a0b4534a14f58e20944452dff00a22a06ce629d1", release_date="2023-05-22", modalities=["audio", "text"], @@ -293,7 +293,7 @@ def encode( ClapZeroShotWrapper, model_name="laion/larger_clap_music_and_speech" ), name="laion/larger_clap_music_and_speech", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="195c3a3e68faebb3e2088b9a79e79b43ddbda76b", release_date="2023-05-22", modalities=["audio", "text"], diff --git a/mteb/models/clip_models.py b/mteb/models/clip_models.py index a8c3da96c8..054ff92c2d 100644 --- a/mteb/models/clip_models.py +++ b/mteb/models/clip_models.py @@ -143,7 +143,7 @@ def get_fused_embeddings( model_name="openai/clip-vit-large-patch14", ), name="openai/clip-vit-large-patch14", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="32bd64288804d66eefd0ccbe215aa642df71cc41", release_date="2021-02-26", modalities=["image", "text"], @@ -168,7 +168,7 @@ def get_fused_embeddings( model_name="openai/clip-vit-base-patch32", ), name="openai/clip-vit-base-patch32", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="3d74acf9a28c67741b2f4f2ea7635f0aaf6f0268", release_date="2021-02-26", modalities=["image", "text"], @@ -193,7 +193,7 @@ def get_fused_embeddings( model_name="openai/clip-vit-base-patch16", ), name="openai/clip-vit-base-patch16", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="57c216476eefef5ab752ec549e440a49ae4ae5f3", release_date="2021-02-26", modalities=["image", "text"], diff --git a/mteb/models/colbert_models.py b/mteb/models/colbert_models.py index aea1bcb914..8306daea9e 100644 --- a/mteb/models/colbert_models.py +++ 
b/mteb/models/colbert_models.py @@ -145,7 +145,7 @@ def similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray: model_name="colbert-ir/colbertv2.0", ), name="colbert-ir/colbertv2.0", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="c1e84128e85ef755c096a95bdb06b47793b13acf", public_training_code=None, diff --git a/mteb/models/conan_models.py b/mteb/models/conan_models.py new file mode 100644 index 0000000000..1a69d880ad --- /dev/null +++ b/mteb/models/conan_models.py @@ -0,0 +1,214 @@ +from __future__ import annotations + +import hashlib +import json +import logging +import os +import random +import string +import time +from functools import partial +from typing import Any + +import numpy as np +import requests + +from mteb.model_meta import ModelMeta +from mteb.models.bge_models import bge_full_data +from mteb.models.e5_instruct import E5_MISTRAL_TRAINING_DATA +from mteb.models.wrapper import Wrapper + +conan_zh_datasets = { + "BQ": ["train"], + "LCQMC": ["train"], + "PAWSX": ["train"], + "STS-B": ["train"], + "DuRetrieval": ["train"], + "AFQMC": ["train"], + "Cmnli": ["train"], + "Ocnli": ["train"], + "T2Retrieval": ["train"], + "T2Reranking": ["train"], + "MMarcoReranking": ["train"], + "CMedQAv2-reranking": ["train"], +} + +logger = logging.getLogger(__name__) + + +class RateLimiter: + def __init__(self, qps, max_retries=3): + self.qps = qps + self.min_interval = 1.0 / qps + self.last_request_time = 0 + self.max_retries = max_retries + + def wait(self): + """Simple rate limiting logic""" + current_time = time.time() + time_since_last = current_time - self.last_request_time + if time_since_last < self.min_interval: + sleep_time = self.min_interval - time_since_last + time.sleep(sleep_time) + self.last_request_time = time.time() + + def execute_with_retry(self, func, *args, **kwargs): + """Execute a function with retry logic + + Args: + func: The function to execute + *args: Arguments to pass to the function + **kwargs: 
Arguments to pass to the function + + Returns: + The result of the function execution + + Raises: + Exception: When max retries are reached and still failing + """ + retries = 0 + while retries < self.max_retries: + self.wait() + try: + return func(*args, **kwargs) + except Exception as e: + retries += 1 + if retries < self.max_retries: + sleep_time = 10 * retries + logger.warning( + f"Request failed (attempt {retries}/{self.max_retries}), " + f"sleeping for {sleep_time}s. Error: {str(e)}" + ) + time.sleep(sleep_time) + else: + logger.error(f"Max retries reached. Last error: {str(e)}") + raise + + +class Client: + def __init__(self, ak, sk, url, timeout=30): + self.ak = ak + self.sk = sk + self.url = url + self.timeout = timeout + self.rate_limiter = RateLimiter(qps=5, max_retries=3) + + def _random_password(self, size=40, chars=None): + if chars is None: + chars = string.ascii_uppercase + string.ascii_lowercase + string.digits + random_chars = random.SystemRandom().choice + return "".join(random_chars(chars) for _ in range(size)) + + def __signature(self, random_str, time_stamp): + params_str = f"{self.ak}:{time_stamp}:{random_str}:{self.sk}" + encoded_params_str = params_str.encode("utf-8") + return hashlib.md5(encoded_params_str).hexdigest() + + def get_signature(self): + timestamp = int(time.time()) + random_str = self._random_password(20) + sig = self.__signature(random_str, timestamp) + params = { + "timestamp": timestamp, + "random": random_str, + "app_id": self.ak, + "sign": sig, + } + return params + + def _do_request(self, text): + """Execute the actual request without retry logic""" + params = self.get_signature() + params["body"] = text + params["content_id"] = f"test_{int(time.time())}" + headers = {"Content-Type": "application/json"} + + rsp = requests.post( + self.url, data=json.dumps(params), timeout=self.timeout, headers=headers + ) + result = rsp.json() + + if rsp.status_code != 200: + raise Exception( + f"API request failed with status 
{rsp.status_code}: {result}" + ) + + return result + + def embed(self, text): + """Embed text using the server with rate limiting and retry logic + + Args: + text: The input text to embed + + Returns: + dict: Response containing embedding + """ + # Use rate_limiter to execute the request, handling rate limiting and retries + return self.rate_limiter.execute_with_retry(self._do_request, text) + + +class ConanWrapper(Wrapper): + def __init__( + self, + model_name: str, + **kwargs, + ) -> None: + AK = os.getenv("CONAN_AK") + SK = os.getenv("CONAN_SK") + if not AK or not SK: + raise ValueError("CONAN_AK and CONAN_SK environment variables must be set") + + self.client = Client(ak=AK, sk=SK, url="https://ai.om.qq.com/api/conan/v2") + self.model_name = model_name + + def encode( + self, + sentences: list[str], + **kwargs: Any, + ) -> np.ndarray: + embeddings = [] + + for sentence in sentences: + try: + result = self.client.embed(sentence) + if "embedding" not in result: + raise ValueError(f"No embedding in response: {result}") + embeddings.append(result["embedding"]) + except Exception as e: + logger.error(f"Failed to embed sentence: {str(e)}") + raise + + return np.array(embeddings) + + +Conan_embedding_v2 = ModelMeta( + name="TencentBAC/Conan-embedding-v2", + revision="e5c87c63889630bca87486f6a2645ed97c5ddb17", + release_date="2025-04-10", + languages=[ + "eng-Latn", + "zho-Hans", + ], + loader=partial( # type: ignore + ConanWrapper, + model_name="Conan-embedding-v2", + ), + max_tokens=32768, + embed_dim=3584, + open_weights=False, + n_parameters=None, + memory_usage_mb=None, + license="apache-2.0", + reference="https://huggingface.co/TencentBAC/Conan-embedding-v2", + similarity_fn_name="cosine", + framework=["API"], + use_instructions=True, + training_datasets={ + **E5_MISTRAL_TRAINING_DATA, + **bge_full_data, + **conan_zh_datasets, + }, + public_training_code=None, + public_training_data=None, +) diff --git a/mteb/models/dino_models.py b/mteb/models/dino_models.py 
index 31cd442f25..b25d8770ee 100644 --- a/mteb/models/dino_models.py +++ b/mteb/models/dino_models.py @@ -132,7 +132,7 @@ def get_fused_embeddings( model_name="facebook/dinov2-small", ), name="facebook/dinov2-small", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="ed25f3a31f01632728cabb09d1542f84ab7b0056", release_date="2023-07-18", modalities=["image"], @@ -157,7 +157,7 @@ def get_fused_embeddings( model_name="facebook/dinov2-base", ), name="facebook/dinov2-base", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="f9e44c814b77203eaa57a6bdbbd535f21ede1415", release_date="2023-07-18", modalities=["image"], @@ -182,7 +182,7 @@ def get_fused_embeddings( model_name="facebook/dinov2-large", ), name="facebook/dinov2-large", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="47b73eefe95e8d44ec3623f8890bd894b6ea2d6c", release_date="2023-07-18", modalities=["image"], @@ -207,7 +207,7 @@ def get_fused_embeddings( model_name="facebook/dinov2-giant", ), name="facebook/dinov2-giant", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="611a9d42f2335e0f921f1e313ad3c1b7178d206d", release_date="2023-07-18", modalities=["image"], @@ -225,3 +225,382 @@ def get_fused_embeddings( use_instructions=False, training_datasets=dinov2_training_datasets, ) + +webssl_dino_training_datasets = { + # MetaCLIP 2B samples +} + +webssl_dino300m_full2b = ModelMeta( + loader=partial( + DINOModelWrapper, + model_name="facebook/webssl-dino300m-full2b-224", + ), + name="facebook/webssl-dino300m-full2b-224", + languages=["eng-Latn"], + revision="8529cdb3fb75014932af3b896455fc21c386168e", + release_date="2025-04-24", + modalities=["image"], + n_parameters=304_000_000, + memory_usage_mb=1158, + max_tokens=None, + embed_dim=1024, + license="cc-by-nc-4.0", + open_weights=True, + public_training_code="", + public_training_data=None, + framework=["PyTorch"], + reference="https://huggingface.co/facebook/webssl-dino300m-full2b-224", + similarity_fn_name=None, + 
use_instructions=False, + training_datasets=webssl_dino_training_datasets, +) + +webssl_dino1b_full2b = ModelMeta( + loader=partial( + DINOModelWrapper, + model_name="facebook/webssl-dino1b-full2b-224", + ), + name="facebook/webssl-dino1b-full2b-224", + languages=["eng-Latn"], + revision="d3bf033d9c8cc62ea9e73c40956642cad2ec568a", + release_date="2025-04-24", + modalities=["image"], + n_parameters=1_130_000_000, + memory_usage_mb=4329, + max_tokens=None, + embed_dim=1536, + license="cc-by-nc-4.0", + open_weights=True, + public_training_code="", + public_training_data=None, + framework=["PyTorch"], + reference="https://huggingface.co/facebook/webssl-dino1b-full2b-224", + similarity_fn_name=None, + use_instructions=False, + training_datasets=webssl_dino_training_datasets, +) + +webssl_dino2b_full2b = ModelMeta( + loader=partial( + DINOModelWrapper, + model_name="facebook/webssl-dino2b-full2b-224", + ), + name="facebook/webssl-dino2b-full2b-224", + languages=["eng-Latn"], + revision="cd5893e3fd2e988eb716792049b3dd53b3f1b68b", + release_date="2025-04-24", + modalities=["image"], + n_parameters=2_080_000_000, + memory_usage_mb=7951, + max_tokens=None, + embed_dim=2688, + license="cc-by-nc-4.0", + open_weights=True, + public_training_code="", + public_training_data=None, + framework=["PyTorch"], + reference="https://huggingface.co/facebook/webssl-dino2b-full2b-224", + similarity_fn_name=None, + use_instructions=False, + training_datasets=webssl_dino_training_datasets, +) + +webssl_dino3b_full2b = ModelMeta( + loader=partial( + DINOModelWrapper, + model_name="facebook/webssl-dino3b-full2b-224", + ), + name="facebook/webssl-dino3b-full2b-224", + languages=["eng-Latn"], + revision="2d015c340b16bc47bc6557fcb4e6c83a9d4aa1d3", + release_date="2025-04-24", + modalities=["image"], + n_parameters=3_000_000_000, + memory_usage_mb=11247, + max_tokens=None, + embed_dim=3072, + license="cc-by-nc-4.0", + open_weights=True, + public_training_code="", + public_training_data=None, + 
framework=["PyTorch"], + reference="https://huggingface.co/facebook/webssl-dino3b-full2b-224", + similarity_fn_name=None, + use_instructions=False, + training_datasets=webssl_dino_training_datasets, +) + +webssl_dino5b_full2b = ModelMeta( + loader=partial( + DINOModelWrapper, + model_name="facebook/webssl-dino5b-full2b-224", + ), + name="facebook/webssl-dino5b-full2b-224", + languages=["eng-Latn"], + revision="88006b18b9af369f6c611db7a64d908bde3714e0", + release_date="2025-04-24", + modalities=["image"], + n_parameters=5_000_000_000, + memory_usage_mb=18838, + max_tokens=None, + embed_dim=3584, + license="cc-by-nc-4.0", + open_weights=True, + public_training_code="", + public_training_data=None, + framework=["PyTorch"], + reference="https://huggingface.co/facebook/webssl-dino5b-full2b-224", + similarity_fn_name=None, + use_instructions=False, + training_datasets=webssl_dino_training_datasets, +) + +webssl_dino7b_full8b_224 = ModelMeta( + loader=partial( + DINOModelWrapper, + model_name="facebook/webssl-dino7b-full8b-224", + ), + name="facebook/webssl-dino7b-full8b-224", + languages=["eng-Latn"], + revision="c6085463ea680043042a80c6d41db2c65e85f466", + release_date="2025-04-24", + modalities=["image"], + n_parameters=7_000_000_000, + memory_usage_mb=24605, + max_tokens=None, + embed_dim=4096, + license="cc-by-nc-4.0", + open_weights=True, + public_training_code="", + public_training_data=None, + framework=["PyTorch"], + reference="https://huggingface.co/facebook/webssl-dino7b-full8b-224", + similarity_fn_name=None, + use_instructions=False, + training_datasets=webssl_dino_training_datasets, +) + +webssl_dino7b_full8b_378 = ModelMeta( + loader=partial( + DINOModelWrapper, + model_name="facebook/webssl-dino7b-full8b-378", + ), + name="facebook/webssl-dino7b-full8b-378", + languages=["eng-Latn"], + revision="53c8c5b43070bd2ddb3f66161140408ce832301f", + release_date="2025-04-24", + modalities=["image"], + n_parameters=7_000_000_000, + memory_usage_mb=24613, + 
max_tokens=None, + embed_dim=4096, + license="cc-by-nc-4.0", + open_weights=True, + public_training_code="", + public_training_data=None, + framework=["PyTorch"], + reference="https://huggingface.co/facebook/webssl-dino7b-full8b-378", + similarity_fn_name=None, + use_instructions=False, + training_datasets=webssl_dino_training_datasets, +) + +webssl_dino7b_full8b_518 = ModelMeta( + loader=partial( + DINOModelWrapper, + model_name="facebook/webssl-dino7b-full8b-518", + ), + name="facebook/webssl-dino7b-full8b-518", + languages=["eng-Latn"], + revision="aee350d2c5e3e5fdb7ee6985291d808ea5eef431", + release_date="2025-04-24", + modalities=["image"], + n_parameters=7_000_000_000, + memory_usage_mb=24623, + max_tokens=None, + embed_dim=4096, + license="cc-by-nc-4.0", + open_weights=True, + public_training_code="", + public_training_data=None, + framework=["PyTorch"], + reference="https://huggingface.co/facebook/webssl-dino7b-full8b-518", + similarity_fn_name=None, + use_instructions=False, + training_datasets=webssl_dino_training_datasets, +) + +webssl_dino2b_light2b = ModelMeta( + loader=partial( + DINOModelWrapper, + model_name="facebook/webssl-dino2b-light2b-224", + ), + name="facebook/webssl-dino2b-light2b-224", + languages=["eng-Latn"], + revision="633a663f304e63cc3cbec3f7f9ca2fbc94736128", + release_date="2025-04-24", + modalities=["image"], + n_parameters=2_000_000_000, + memory_usage_mb=7951, + max_tokens=None, + embed_dim=2688, + license="cc-by-nc-4.0", + open_weights=True, + public_training_code="", + public_training_data=None, + framework=["PyTorch"], + reference="https://huggingface.co/facebook/webssl-dino2b-light2b-224", + similarity_fn_name=None, + use_instructions=False, + training_datasets=webssl_dino_training_datasets, +) + +webssl_dino2b_heavy2b = ModelMeta( + loader=partial( + DINOModelWrapper, + model_name="facebook/webssl-dino2b-heavy2b-224", + ), + name="facebook/webssl-dino2b-heavy2b-224", + languages=["eng-Latn"], + 
revision="9f46eb0c0129656a1ef195fde072e3765abdb7c6", + release_date="2025-04-24", + modalities=["image"], + n_parameters=2_000_000_000, + memory_usage_mb=7951, + max_tokens=None, + embed_dim=2688, + license="cc-by-nc-4.0", + open_weights=True, + public_training_code="", + public_training_data=None, + framework=["PyTorch"], + reference="https://huggingface.co/facebook/webssl-dino2b-heavy2b-224", + similarity_fn_name=None, + use_instructions=False, + training_datasets=webssl_dino_training_datasets, +) + +webssl_dino3b_light2b = ModelMeta( + loader=partial( + DINOModelWrapper, + model_name="facebook/webssl-dino3b-light2b-224", + ), + name="facebook/webssl-dino3b-light2b-224", + languages=["eng-Latn"], + revision="4d0160f60673805431f4ad14983e712ed88be5b8", + release_date="2025-04-24", + modalities=["image"], + n_parameters=3_000_000_000, + memory_usage_mb=11247, + max_tokens=None, + embed_dim=3072, + license="cc-by-nc-4.0", + open_weights=True, + public_training_code="", + public_training_data=None, + framework=["PyTorch"], + reference="https://huggingface.co/facebook/webssl-dino3b-light2b-224", + similarity_fn_name=None, + use_instructions=False, + training_datasets=webssl_dino_training_datasets, +) + +webssl_dino3b_heavy2b = ModelMeta( + loader=partial( + DINOModelWrapper, + model_name="facebook/webssl-dino3b-heavy2b-224", + ), + name="facebook/webssl-dino3b-heavy2b-224", + languages=["eng-Latn"], + revision="dd39c2910747561b332285d96c4dce0bdb240775", + release_date="2025-04-24", + modalities=["image"], + n_parameters=3_000_000_000, + memory_usage_mb=11247, + max_tokens=None, + embed_dim=3072, + license="cc-by-nc-4.0", + open_weights=True, + public_training_code="", + public_training_data=None, + framework=["PyTorch"], + reference="https://huggingface.co/facebook/webssl-dino3b-heavy2b-224", + similarity_fn_name=None, + use_instructions=False, + training_datasets=webssl_dino_training_datasets, +) + +webssl_mae300m_full2b = ModelMeta( + loader=partial( + 
DINOModelWrapper, + model_name="facebook/webssl-mae300m-full2b-224", + ), + name="facebook/webssl-mae300m-full2b-224", + languages=["eng-Latn"], + revision="4655a0ac1726c206ba14d5ccb26758c62a4d03b0", + release_date="2025-04-24", + modalities=["image"], + n_parameters=304_000_000, + memory_usage_mb=1161, + max_tokens=None, + embed_dim=1024, + license="cc-by-nc-4.0", + open_weights=True, + public_training_code="", + public_training_data=None, + framework=["PyTorch"], + reference="https://huggingface.co/facebook/webssl-mae300m-full2b-224", + similarity_fn_name=None, + use_instructions=False, + training_datasets=webssl_dino_training_datasets, +) + +webssl_mae700m_full2b = ModelMeta( + loader=partial( + DINOModelWrapper, + model_name="facebook/webssl-mae700m-full2b-224", + ), + name="facebook/webssl-mae700m-full2b-224", + languages=["eng-Latn"], + revision="c32be382e757d73a178de1ead62c27391d4b4280", + release_date="2025-04-24", + modalities=["image"], + n_parameters=700_000_000, + memory_usage_mb=2412, + max_tokens=None, + embed_dim=1280, + license="cc-by-nc-4.0", + open_weights=True, + public_training_code="", + public_training_data=None, + framework=["PyTorch"], + reference="https://huggingface.co/facebook/webssl-mae700m-full2b-224", + similarity_fn_name=None, + use_instructions=False, + training_datasets=webssl_dino_training_datasets, +) + +webssl_mae1b_full2b = ModelMeta( + loader=partial( + DINOModelWrapper, + model_name="facebook/webssl-mae1b-full2b-224", + ), + name="facebook/webssl-mae1b-full2b-224", + languages=["eng-Latn"], + revision="5880aefedbad8db0f44d27358f6f08e8576f70fc", + release_date="2025-04-24", + modalities=["image"], + n_parameters=1_000_000_000, + memory_usage_mb=4337, + max_tokens=None, + embed_dim=1536, + license="cc-by-nc-4.0", + open_weights=True, + public_training_code="", + public_training_data=None, + framework=["PyTorch"], + reference="https://huggingface.co/facebook/webssl-mae1b-full2b-224", + similarity_fn_name=None, + 
use_instructions=False, + training_datasets=webssl_dino_training_datasets, +) diff --git a/mteb/models/e5_instruct.py b/mteb/models/e5_instruct.py index 49245cbed8..6b51f31a96 100644 --- a/mteb/models/e5_instruct.py +++ b/mteb/models/e5_instruct.py @@ -10,9 +10,12 @@ ME5_TRAINING_DATA, XLMR_LANGUAGES, ) -from mteb.models.instruct_wrapper import instruct_wrapper +from mteb.models.instruct_wrapper import ( + InstructSentenceTransformerWrapper, + instruct_wrapper, +) -MISTRAL_LANGUAGES = ["eng_Latn", "fra_Latn", "deu_Latn", "ita_Latn", "spa_Latn"] +MISTRAL_LANGUAGES = ["eng-Latn", "fra-Latn", "deu-Latn", "ita-Latn", "spa-Latn"] E5_INSTRUCTION = "Instruct: {instruction}\nQuery: " @@ -110,7 +113,7 @@ name="zeta-alpha-ai/Zeta-Alpha-E5-Mistral", revision="c791d37474fa6a5c72eb3a2522be346bc21fbfc3", release_date="2024-08-30", - languages=["eng_Latn"], + languages=["eng-Latn"], n_parameters=7110660096, memory_usage_mb=13563, max_tokens=32768.0, @@ -175,3 +178,34 @@ adapted_from="intfloat/e5-mistral-7b-instruct", superseded_by=None, ) + +E5_R_MISTRAL_7B_INSTRUCTION = "{instruction}\n" +BeastyZ__e5_R_mistral_7b = ModelMeta( + loader=partial( # type: ignore + InstructSentenceTransformerWrapper, + model_name="BeastyZ/e5-R-mistral-7b", + revision="3f810a6a7fd220369ad248e3705cf13d71803602", + instruction_template=E5_R_MISTRAL_7B_INSTRUCTION, + tokenizer_kwargs={"pad_token": ""}, + ), + name="BeastyZ/e5-R-mistral-7b", + revision="3f810a6a7fd220369ad248e3705cf13d71803602", + release_date="2024-06-28", + languages=["eng-Latn"], + n_parameters=7241732096, + memory_usage_mb=27625, + max_tokens=32768.0, + embed_dim=4096, + license="apache-2.0", + open_weights=True, + public_training_code="https://github.com/LeeSureman/E5-Retrieval-Reproduction", + public_training_data="https://huggingface.co/datasets/BeastyZ/E5-R", + framework=["PyTorch"], + reference="https://huggingface.co/BeastyZ/e5-R-mistral-7b", + similarity_fn_name="cosine", + use_instructions=True, + 
training_datasets=E5_MISTRAL_TRAINING_DATA, + # not MTEB: {"BeastyZ/E5-R": ["train"]}, + adapted_from="intfloat/e5-mistral-7b-instruct", + superseded_by=None, +) diff --git a/mteb/models/e5_models.py b/mteb/models/e5_models.py index c7515b761d..395d1da311 100644 --- a/mteb/models/e5_models.py +++ b/mteb/models/e5_models.py @@ -7,105 +7,105 @@ E5_PAPER_RELEASE_DATE = "2024-02-08" XLMR_LANGUAGES = [ - "afr_Latn", - "amh_Latn", - "ara_Latn", - "asm_Latn", - "aze_Latn", - "bel_Latn", - "bul_Latn", - "ben_Latn", - "ben_Beng", - "bre_Latn", - "bos_Latn", - "cat_Latn", - "ces_Latn", - "cym_Latn", - "dan_Latn", - "deu_Latn", - "ell_Latn", - "eng_Latn", - "epo_Latn", - "spa_Latn", - "est_Latn", - "eus_Latn", - "fas_Latn", - "fin_Latn", - "fra_Latn", - "fry_Latn", - "gle_Latn", - "gla_Latn", - "glg_Latn", - "guj_Latn", - "hau_Latn", - "heb_Latn", - "hin_Latn", - "hin_Deva", - "hrv_Latn", - "hun_Latn", - "hye_Latn", - "ind_Latn", - "isl_Latn", - "ita_Latn", - "jpn_Latn", - "jav_Latn", - "kat_Latn", - "kaz_Latn", - "khm_Latn", - "kan_Latn", - "kor_Latn", - "kur_Latn", - "kir_Latn", - "lat_Latn", - "lao_Latn", - "lit_Latn", - "lav_Latn", - "mlg_Latn", - "mkd_Latn", - "mal_Latn", - "mon_Latn", - "mar_Latn", - "msa_Latn", - "mya_Latn", - "nep_Latn", - "nld_Latn", - "nob_Latn", - "orm_Latn", - "ori_Latn", - "pan_Latn", - "pol_Latn", - "pus_Latn", - "por_Latn", - "ron_Latn", - "rus_Latn", - "san_Latn", - "snd_Latn", - "sin_Latn", - "slk_Latn", - "slv_Latn", - "som_Latn", - "sqi_Latn", - "srp_Latn", - "sun_Latn", - "swe_Latn", - "swa_Latn", - "tam_Latn", - "tam_Taml", - "tel_Latn", - "tel_Telu", - "tha_Latn", - "tgl_Latn", - "tur_Latn", - "uig_Latn", - "ukr_Latn", - "urd_Latn", - "urd_Arab", - "uzb_Latn", - "vie_Latn", - "xho_Latn", - "yid_Latn", - "zho_Hant", - "zho_Hans", + "afr-Latn", + "amh-Latn", + "ara-Latn", + "asm-Latn", + "aze-Latn", + "bel-Latn", + "bul-Latn", + "ben-Latn", + "ben-Beng", + "bre-Latn", + "bos-Latn", + "cat-Latn", + "ces-Latn", + "cym-Latn", + "dan-Latn", + 
"deu-Latn", + "ell-Latn", + "eng-Latn", + "epo-Latn", + "spa-Latn", + "est-Latn", + "eus-Latn", + "fas-Latn", + "fin-Latn", + "fra-Latn", + "fry-Latn", + "gle-Latn", + "gla-Latn", + "glg-Latn", + "guj-Latn", + "hau-Latn", + "heb-Latn", + "hin-Latn", + "hin-Deva", + "hrv-Latn", + "hun-Latn", + "hye-Latn", + "ind-Latn", + "isl-Latn", + "ita-Latn", + "jpn-Latn", + "jav-Latn", + "kat-Latn", + "kaz-Latn", + "khm-Latn", + "kan-Latn", + "kor-Latn", + "kur-Latn", + "kir-Latn", + "lat-Latn", + "lao-Latn", + "lit-Latn", + "lav-Latn", + "mlg-Latn", + "mkd-Latn", + "mal-Latn", + "mon-Latn", + "mar-Latn", + "msa-Latn", + "mya-Latn", + "nep-Latn", + "nld-Latn", + "nob-Latn", + "orm-Latn", + "ori-Latn", + "pan-Latn", + "pol-Latn", + "pus-Latn", + "por-Latn", + "ron-Latn", + "rus-Latn", + "san-Latn", + "snd-Latn", + "sin-Latn", + "slk-Latn", + "slv-Latn", + "som-Latn", + "sqi-Latn", + "srp-Latn", + "sun-Latn", + "swe-Latn", + "swa-Latn", + "tam-Latn", + "tam-Taml", + "tel-Latn", + "tel-Telu", + "tha-Latn", + "tgl-Latn", + "tur-Latn", + "uig-Latn", + "ukr-Latn", + "urd-Latn", + "urd-Arab", + "uzb-Latn", + "vie-Latn", + "xho-Latn", + "yid-Latn", + "zho-Hant", + "zho-Hans", ] model_prompts = { @@ -130,6 +130,7 @@ ME5_TRAINING_DATA = { **E5_TRAINING_DATA, + "XQuADRetrieval": ["train"], # trained on SQuAD train dataset "FEVER": ["train"], "FEVERHardNegatives": ["train"], "FEVER-NL": ["train"], # translation not trained on @@ -230,7 +231,7 @@ model_prompts=model_prompts, ), name="intfloat/e5-small-v2", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="dca8b1a9dae0d4575df2bf423a5edb485a431236", release_date=E5_PAPER_RELEASE_DATE, @@ -257,7 +258,7 @@ model_prompts=model_prompts, ), name="intfloat/e5-small", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="e272f3049e853b47cb5ca3952268c6662abda68f", release_date=E5_PAPER_RELEASE_DATE, @@ -284,7 +285,7 @@ model_prompts=model_prompts, ), name="intfloat/e5-base-v2", - 
languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="1c644c92ad3ba1efdad3f1451a637716616a20e8", release_date=E5_PAPER_RELEASE_DATE, @@ -312,7 +313,7 @@ model_prompts=model_prompts, ), name="intfloat/e5-large-v2", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="b322e09026e4ea05f42beadf4d661fb4e101d311", release_date=E5_PAPER_RELEASE_DATE, diff --git a/mteb/models/e5_v.py b/mteb/models/e5_v.py index 14383b4413..5c7a227b44 100644 --- a/mteb/models/e5_v.py +++ b/mteb/models/e5_v.py @@ -194,7 +194,7 @@ def get_fused_embeddings( device_map="auto", ), name="royokong/e5-v", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="0c1f22679417b3ae925d779442221c40cd1861ab", release_date="2024-07-17", modalities=["image", "text"], diff --git a/mteb/models/evaclip_models.py b/mteb/models/evaclip_models.py index 03ce84e032..ca25e0084c 100644 --- a/mteb/models/evaclip_models.py +++ b/mteb/models/evaclip_models.py @@ -182,7 +182,7 @@ def get_fused_embeddings( model_name="EVA02-CLIP-B-16", ), name="QuanSun/EVA02-CLIP-B-16", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="11afd202f2ae80869d6cef18b1ec775e79bd8d12", release_date="2023-04-26", modalities=["image", "text"], @@ -207,7 +207,7 @@ def get_fused_embeddings( model_name="EVA02-CLIP-L-14", ), name="QuanSun/EVA02-CLIP-L-14", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="11afd202f2ae80869d6cef18b1ec775e79bd8d12", release_date="2023-04-26", modalities=["image", "text"], @@ -232,7 +232,7 @@ def get_fused_embeddings( model_name="EVA02-CLIP-bigE-14", ), name="QuanSun/EVA02-CLIP-bigE-14", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="11afd202f2ae80869d6cef18b1ec775e79bd8d12", release_date="2023-04-26", modalities=["image", "text"], @@ -258,7 +258,7 @@ def get_fused_embeddings( model_name="EVA02-CLIP-bigE-14-plus", ), name="QuanSun/EVA02-CLIP-bigE-14-plus", - languages=["eng_Latn"], + languages=["eng-Latn"], 
revision="11afd202f2ae80869d6cef18b1ec775e79bd8d12", release_date="2023-04-26", modalities=["image", "text"], diff --git a/mteb/models/gme_v_models.py b/mteb/models/gme_v_models.py index 8d83b54a33..29e7de8c78 100644 --- a/mteb/models/gme_v_models.py +++ b/mteb/models/gme_v_models.py @@ -412,7 +412,7 @@ def fetch_image( model_name="Alibaba-NLP/gme-Qwen2-VL-2B-Instruct", ), name="Alibaba-NLP/gme-Qwen2-VL-2B-Instruct", - languages=["eng_Latn", "cmn-Hans"], + languages=["eng-Latn", "cmn-Hans"], open_weights=True, revision="ce765ae71b8cdb208203cd8fb64a170b1b84293a", release_date="2024-12-24", @@ -437,7 +437,7 @@ def fetch_image( model_name="Alibaba-NLP/gme-Qwen2-VL-7B-Instruct", ), name="Alibaba-NLP/gme-Qwen2-VL-7B-Instruct", - languages=["eng_Latn", "cmn-Hans"], + languages=["eng-Latn", "cmn-Hans"], open_weights=True, revision="477027a6480f8630363be77751f169cc3434b673", release_date="2024-12-24", diff --git a/mteb/models/google_models.py b/mteb/models/google_models.py index cd98e35b45..2de26b55c9 100644 --- a/mteb/models/google_models.py +++ b/mteb/models/google_models.py @@ -12,25 +12,25 @@ from mteb.requires_package import requires_package MULTILINGUAL_EVALUATED_LANGUAGES = [ - "arb_Arab", - "ben_Beng", - "eng_Latn", - "spa_Latn", - "deu_Latn", - "pes_Arab", - "fin_Latn", - "fra_Latn", - "hin_Deva", - "ind_Latn", - "jpn_Jpan", - "kor_Hang", - "rus_Cyrl", - "swh_Latn", - "tel_Telu", - "tha_Thai", - "yor_Latn", - "zho_Hant", - "zho_Hans", + "arb-Arab", + "ben-Beng", + "eng-Latn", + "spa-Latn", + "deu-Latn", + "pes-Arab", + "fin-Latn", + "fra-Latn", + "hin-Deva", + "ind-Latn", + "jpn-Jpan", + "kor-Hang", + "rus-Cyrl", + "swh-Latn", + "tel-Telu", + "tha-Thai", + "yor-Latn", + "zho-Hant", + "zho-Hans", ] MODEL_PROMPTS = { diff --git a/mteb/models/gritlm_models.py b/mteb/models/gritlm_models.py index 02e48db4e1..9afc5b4c82 100644 --- a/mteb/models/gritlm_models.py +++ b/mteb/models/gritlm_models.py @@ -30,7 +30,7 @@ def gritlm_instruction(instruction: str = "", 
prompt_type=None) -> str: torch_dtype="auto", ), name="GritLM/GritLM-7B", - languages=["eng_Latn", "fra_Latn", "deu_Latn", "ita_Latn", "spa_Latn"], + languages=["eng-Latn", "fra-Latn", "deu-Latn", "ita-Latn", "spa-Latn"], open_weights=True, revision="13f00a0e36500c80ce12870ea513846a066004af", release_date="2024-02-15", @@ -58,7 +58,7 @@ def gritlm_instruction(instruction: str = "", prompt_type=None) -> str: torch_dtype="auto", ), name="GritLM/GritLM-8x7B", - languages=["eng_Latn", "fra_Latn", "deu_Latn", "ita_Latn", "spa_Latn"], + languages=["eng-Latn", "fra-Latn", "deu-Latn", "ita-Latn", "spa-Latn"], open_weights=True, revision="7f089b13e3345510281733ca1e6ff871b5b4bc76", release_date="2024-02-15", diff --git a/mteb/models/gte_models.py b/mteb/models/gte_models.py index 7b7464ff26..f3615d084c 100644 --- a/mteb/models/gte_models.py +++ b/mteb/models/gte_models.py @@ -65,7 +65,7 @@ def instruction_template( embed_eos="<|endoftext|>", ), name="Alibaba-NLP/gte-Qwen1.5-7B-instruct", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="07d27e5226328010336563bc1b564a5e3436a298", release_date="2024-04-20", # initial commit of hf model. @@ -96,7 +96,7 @@ def instruction_template( embed_eos="<|endoftext|>", ), name="Alibaba-NLP/gte-Qwen2-1.5B-instruct", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="c6c1b92f4a3e1b92b326ad29dd3c8433457df8dd", release_date="2024-07-29", # initial commit of hf model. @@ -121,7 +121,7 @@ def instruction_template( revision="af7bd46fbb00b3a6963c8dd7f1786ddfbfbe973a", ), name="thenlper/gte-small-zh", - languages=["zho_Hans"], + languages=["zho-Hans"], open_weights=True, revision="af7bd46fbb00b3a6963c8dd7f1786ddfbfbe973a", release_date="2023-11-08", # initial commit of hf model. 
@@ -146,7 +146,7 @@ def instruction_template( revision="71ab7947d6fac5b64aa299e6e40e6c2b2e85976c", ), name="thenlper/gte-base-zh", - languages=["zho_Hans"], + languages=["zho-Hans"], open_weights=True, revision="71ab7947d6fac5b64aa299e6e40e6c2b2e85976c", release_date="2023-11-08", # initial commit of hf model. @@ -171,7 +171,7 @@ def instruction_template( revision="64c364e579de308104a9b2c170ca009502f4f545", ), name="thenlper/gte-large-zh", - languages=["zho_Hans"], + languages=["zho-Hans"], open_weights=True, revision="64c364e579de308104a9b2c170ca009502f4f545", release_date="2023-11-08", # initial commit of hf model. @@ -190,77 +190,77 @@ def instruction_template( ) gte_multilingual_langs = [ - "afr_Latn", - "ara_Arab", - "aze_Latn", - "bel_Cyrl", - "bul_Cyrl", - "ben_Beng", - "cat_Latn", - "ceb_Latn", - "ces_Latn", - "cym_Latn", - "dan_Latn", - "deu_Latn", - "ell_Grek", - "eng_Latn", - "spa_Latn", - "est_Latn", - "eus_Latn", - "fas_Arab", - "fin_Latn", - "fra_Latn", - "glg_Latn", - "guj_Gujr", - "heb_Hebr", - "hin_Deva", - "hrv_Latn", - "hat_Latn", - "hun_Latn", - "hye_Armn", - "ind_Latn", - "isl_Latn", - "ita_Latn", - "jpn_Jpan", - "jav_Latn", - "kat_Geor", - "kaz_Cyrl", - "khm_Khmr", - "kan_Knda", - "kor_Hang", - "kir_Cyrl", - "lao_Laoo", - "lit_Latn", - "lav_Latn", - "mkd_Cyrl", - "mal_Mlym", - "mon_Cyrl", - "mar_Deva", - "msa_Latn", - "mya_Mymr", - "nep_Deva", - "nld_Latn", - "nor_Latn", - "pan_Guru", - "pol_Latn", - "por_Latn", - "que_Latn", - "ron_Latn", - "rus_Cyrl", - "sin_Sinh", - "slk_Latn", - "slv_Latn", - "swa_Latn", - "tam_Taml", - "tel_Telu", - "tha_Thai", - "tgl_Latn", - "tur_Latn", - "ukr_Cyrl", - "urd_Arab", - "vie_Latn", - "yor_Latn", - "zho_Hans", + "afr-Latn", + "ara-Arab", + "aze-Latn", + "bel-Cyrl", + "bul-Cyrl", + "ben-Beng", + "cat-Latn", + "ceb-Latn", + "ces-Latn", + "cym-Latn", + "dan-Latn", + "deu-Latn", + "ell-Grek", + "eng-Latn", + "spa-Latn", + "est-Latn", + "eus-Latn", + "fas-Arab", + "fin-Latn", + "fra-Latn", + "glg-Latn", + 
"guj-Gujr", + "heb-Hebr", + "hin-Deva", + "hrv-Latn", + "hat-Latn", + "hun-Latn", + "hye-Armn", + "ind-Latn", + "isl-Latn", + "ita-Latn", + "jpn-Jpan", + "jav-Latn", + "kat-Geor", + "kaz-Cyrl", + "khm-Khmr", + "kan-Knda", + "kor-Hang", + "kir-Cyrl", + "lao-Laoo", + "lit-Latn", + "lav-Latn", + "mkd-Cyrl", + "mal-Mlym", + "mon-Cyrl", + "mar-Deva", + "msa-Latn", + "mya-Mymr", + "nep-Deva", + "nld-Latn", + "nor-Latn", + "pan-Guru", + "pol-Latn", + "por-Latn", + "que-Latn", + "ron-Latn", + "rus-Cyrl", + "sin-Sinh", + "slk-Latn", + "slv-Latn", + "swa-Latn", + "tam-Taml", + "tel-Telu", + "tha-Thai", + "tgl-Latn", + "tur-Latn", + "ukr-Cyrl", + "urd-Arab", + "vie-Latn", + "yor-Latn", + "zho-Hans", ] # Source: https://arxiv.org/pdf/2407.19669 gte_multi_training_data = { @@ -303,7 +303,7 @@ def instruction_template( release_date="2024-07-20", # initial commit of hf model. n_parameters=int(305 * 1e6), memory_usage_mb=582, - embed_dim=1024, + embed_dim=768, license="apache-2.0", max_tokens=8192, reference="https://huggingface.co/Alibaba-NLP/gte-multilingual-base", @@ -322,7 +322,7 @@ def instruction_template( revision="7ca8b4ca700621b67618669f5378fe5f5820b8e4", ), name="Alibaba-NLP/gte-modernbert-base", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="7ca8b4ca700621b67618669f5378fe5f5820b8e4", release_date="2025-01-21", # initial commit of hf model. 
diff --git a/mteb/models/ibm_granite_models.py b/mteb/models/ibm_granite_models.py index f0ab0f0c6e..db5a883f2c 100644 --- a/mteb/models/ibm_granite_models.py +++ b/mteb/models/ibm_granite_models.py @@ -5,19 +5,19 @@ from mteb.model_meta import ModelMeta, sentence_transformers_loader GRANITE_LANGUAGES = [ - "ara_Latn", - "ces_Latn", - "deu_Latn", - "eng_Latn", - "spa_Latn", - "fra_Latn", - "ita_Latn", - "jpn_Latn", - "kor_Latn", - "nld_Latn", - "por_Latn", - "zho_Hant", - "zho_Hans", + "ara-Latn", + "ces-Latn", + "deu-Latn", + "eng-Latn", + "spa-Latn", + "fra-Latn", + "ita-Latn", + "jpn-Latn", + "kor-Latn", + "nld-Latn", + "por-Latn", + "zho-Hant", + "zho-Hans", ] granite_training_data = { @@ -145,7 +145,7 @@ revision="eddbb57470f896b5f8e2bfcb823d8f0e2d2024a5", ), name="ibm-granite/granite-embedding-30m-english", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="eddbb57470f896b5f8e2bfcb823d8f0e2d2024a5", release_date="2024-12-18", @@ -172,7 +172,7 @@ revision="e48d3a5b47eaa18e3fe07d4676e187fd80f32730", ), name="ibm-granite/granite-embedding-125m-english", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="e48d3a5b47eaa18e3fe07d4676e187fd80f32730", release_date="2024-12-18", diff --git a/mteb/models/inf_models.py b/mteb/models/inf_models.py index f53d8c9bdb..6dc8411f6d 100644 --- a/mteb/models/inf_models.py +++ b/mteb/models/inf_models.py @@ -43,7 +43,7 @@ trust_remote_code=True, ), name="infly/inf-retriever-v1", - languages=["eng_Latn", "zho_Hans"], + languages=["eng-Latn", "zho-Hans"], open_weights=True, revision="cb70ca7c31dfa866b2eff2dad229c144d8ddfd91", release_date="2024-12-24", # initial commit of hf model. @@ -70,7 +70,7 @@ trust_remote_code=True, ), name="infly/inf-retriever-v1-1.5b", - languages=["eng_Latn", "zho_Hans"], + languages=["eng-Latn", "zho-Hans"], open_weights=True, revision="c9c05c2dd50707a486966ba81703021ae2094a06", release_date="2025-02-08", # initial commit of hf model. 
diff --git a/mteb/models/jasper_models.py b/mteb/models/jasper_models.py index dcef55d553..b3a3f62516 100644 --- a/mteb/models/jasper_models.py +++ b/mteb/models/jasper_models.py @@ -68,7 +68,7 @@ def encode( config_kwargs={"is_text_encoder": True, "vector_dim": 12288}, model_kwargs={ "attn_implementation": "sdpa", - "torch_dtype": torch.float16, + "torch_dtype": torch.bfloat16, }, trust_remote_code=True, max_seq_length=2048, diff --git a/mteb/models/jina_clip.py b/mteb/models/jina_clip.py index 208b77e44a..c73a6b2de3 100644 --- a/mteb/models/jina_clip.py +++ b/mteb/models/jina_clip.py @@ -157,7 +157,7 @@ def encode( # type: ignore model_name="jinaai/jina-clip-v1", ), name="jinaai/jina-clip-v1", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="06150c7c382d7a4faedc7d5a0d8cdb59308968f4", release_date="2024-05-30", modalities=["image", "text"], diff --git a/mteb/models/jina_models.py b/mteb/models/jina_models.py index a9c05c4041..4bfffe8f6c 100644 --- a/mteb/models/jina_models.py +++ b/mteb/models/jina_models.py @@ -20,105 +20,105 @@ CURRENT_SENTENCE_TRANSFORMERS_VERSION = tuple(map(int, st_version.split("."))) XLMR_LANGUAGES = [ - "afr_Latn", - "amh_Latn", - "ara_Latn", - "asm_Latn", - "aze_Latn", - "bel_Latn", - "bul_Latn", - "ben_Latn", - "ben_Beng", - "bre_Latn", - "bos_Latn", - "cat_Latn", - "ces_Latn", - "cym_Latn", - "dan_Latn", - "deu_Latn", - "ell_Latn", - "eng_Latn", - "epo_Latn", - "spa_Latn", - "est_Latn", - "eus_Latn", - "fas_Latn", - "fin_Latn", - "fra_Latn", - "fry_Latn", - "gle_Latn", - "gla_Latn", - "glg_Latn", - "guj_Latn", - "hau_Latn", - "heb_Latn", - "hin_Latn", - "hin_Deva", - "hrv_Latn", - "hun_Latn", - "hye_Latn", - "ind_Latn", - "isl_Latn", - "ita_Latn", - "jpn_Latn", - "jav_Latn", - "kat_Latn", - "kaz_Latn", - "khm_Latn", - "kan_Latn", - "kor_Latn", - "kur_Latn", - "kir_Latn", - "lat_Latn", - "lao_Latn", - "lit_Latn", - "lav_Latn", - "mlg_Latn", - "mkd_Latn", - "mal_Latn", - "mon_Latn", - "mar_Latn", - "msa_Latn", - "mya_Latn", - 
"nep_Latn", - "nld_Latn", - "nob_Latn", - "orm_Latn", - "ori_Latn", - "pan_Latn", - "pol_Latn", - "pus_Latn", - "por_Latn", - "ron_Latn", - "rus_Latn", - "san_Latn", - "snd_Latn", - "sin_Latn", - "slk_Latn", - "slv_Latn", - "som_Latn", - "sqi_Latn", - "srp_Latn", - "sun_Latn", - "swe_Latn", - "swa_Latn", - "tam_Latn", - "tam_Taml", - "tel_Latn", - "tel_Telu", - "tha_Latn", - "tgl_Latn", - "tur_Latn", - "uig_Latn", - "ukr_Latn", - "urd_Latn", - "urd_Arab", - "uzb_Latn", - "vie_Latn", - "xho_Latn", - "yid_Latn", - "zho_Hant", - "zho_Hans", + "afr-Latn", + "amh-Latn", + "ara-Latn", + "asm-Latn", + "aze-Latn", + "bel-Latn", + "bul-Latn", + "ben-Latn", + "ben-Beng", + "bre-Latn", + "bos-Latn", + "cat-Latn", + "ces-Latn", + "cym-Latn", + "dan-Latn", + "deu-Latn", + "ell-Latn", + "eng-Latn", + "epo-Latn", + "spa-Latn", + "est-Latn", + "eus-Latn", + "fas-Latn", + "fin-Latn", + "fra-Latn", + "fry-Latn", + "gle-Latn", + "gla-Latn", + "glg-Latn", + "guj-Latn", + "hau-Latn", + "heb-Latn", + "hin-Latn", + "hin-Deva", + "hrv-Latn", + "hun-Latn", + "hye-Latn", + "ind-Latn", + "isl-Latn", + "ita-Latn", + "jpn-Latn", + "jav-Latn", + "kat-Latn", + "kaz-Latn", + "khm-Latn", + "kan-Latn", + "kor-Latn", + "kur-Latn", + "kir-Latn", + "lat-Latn", + "lao-Latn", + "lit-Latn", + "lav-Latn", + "mlg-Latn", + "mkd-Latn", + "mal-Latn", + "mon-Latn", + "mar-Latn", + "msa-Latn", + "mya-Latn", + "nep-Latn", + "nld-Latn", + "nob-Latn", + "orm-Latn", + "ori-Latn", + "pan-Latn", + "pol-Latn", + "pus-Latn", + "por-Latn", + "ron-Latn", + "rus-Latn", + "san-Latn", + "snd-Latn", + "sin-Latn", + "slk-Latn", + "slv-Latn", + "som-Latn", + "sqi-Latn", + "srp-Latn", + "sun-Latn", + "swe-Latn", + "swa-Latn", + "tam-Latn", + "tam-Taml", + "tel-Latn", + "tel-Telu", + "tha-Latn", + "tgl-Latn", + "tur-Latn", + "uig-Latn", + "ukr-Latn", + "urd-Latn", + "urd-Arab", + "uzb-Latn", + "vie-Latn", + "xho-Latn", + "yid-Latn", + "zho-Hant", + "zho-Hans", ] @@ -141,11 +141,11 @@ def __init__( raise RuntimeError( 
f"sentence_transformers version {st_version} is lower than the required version 3.1.0" ) - requires_package(self, "jina", model, "pip install 'mteb[jina]'") + requires_package(self, "einops", model, "pip install 'mteb[jina]'") import einops # noqa: F401 requires_package( - self, "flash_attention", model, "pip install 'mteb[flash_attention]'" + self, "flash_attn", model, "pip install 'mteb[flash_attention]'" ) import flash_attn # noqa: F401 diff --git a/mteb/models/linq_models.py b/mteb/models/linq_models.py index 2c0eafa591..3fc773cc7b 100644 --- a/mteb/models/linq_models.py +++ b/mteb/models/linq_models.py @@ -28,7 +28,7 @@ def instruction_template( normalized=True, ), name="Linq-AI-Research/Linq-Embed-Mistral", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="0c1a0b0589177079acc552433cad51d7c9132379", release_date="2024-05-29", # initial commit of hf model. diff --git a/mteb/models/llm2clip_models.py b/mteb/models/llm2clip_models.py index 8b950867ab..86659efe7c 100644 --- a/mteb/models/llm2clip_models.py +++ b/mteb/models/llm2clip_models.py @@ -215,7 +215,7 @@ def get_fused_embeddings( model_name="microsoft/LLM2CLIP-Openai-L-14-336", ), name="microsoft/LLM2CLIP-Openai-L-14-336", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="92512331f393a003c3d98404677f991c188162c9", release_date="2024-11-07", modalities=["image", "text"], @@ -241,7 +241,7 @@ def get_fused_embeddings( model_name="microsoft/LLM2CLIP-Openai-L-14-224", ), name="microsoft/LLM2CLIP-Openai-L-14-224", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="6b8a11a94ff380fa220dfefe73ac9293d2677575", release_date="2024-11-07", modalities=["image", "text"], @@ -266,7 +266,7 @@ def get_fused_embeddings( model_name="microsoft/LLM2CLIP-Openai-B-16", ), name="microsoft/LLM2CLIP-Openai-B-16", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="ecfb347eb3dcfeb2fbc2a2eae7de6ac5a001aaf8", release_date="2024-11-07", modalities=["image", "text"], diff 
--git a/mteb/models/llm2vec_models.py b/mteb/models/llm2vec_models.py index e8ee5c43e7..37983bc159 100644 --- a/mteb/models/llm2vec_models.py +++ b/mteb/models/llm2vec_models.py @@ -116,7 +116,7 @@ def loader_inner(**kwargs: Any) -> Encoder: torch_dtype=torch.bfloat16, ), name="McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp-supervised", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="baa8ebf04a1c2500e61288e7dad65e8ae42601a7", # TODO: Not sure what to put here as a model is made of two peft repos, each with a different revision @@ -144,7 +144,7 @@ def loader_inner(**kwargs: Any) -> Encoder: torch_dtype=torch.bfloat16, ), name="McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp-unsup-simcse", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="1cb7b735326d13a8541db8f57f35da5373f5e9c6", release_date="2024-04-09", @@ -171,7 +171,7 @@ def loader_inner(**kwargs: Any) -> Encoder: torch_dtype=torch.bfloat16, ), name="McGill-NLP/LLM2Vec-Mistral-7B-Instruct-v2-mntp-supervised", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="0ae69bdd5816105778b971c3138e8f8a18eaa3ae", release_date="2024-04-09", @@ -198,7 +198,7 @@ def loader_inner(**kwargs: Any) -> Encoder: torch_dtype=torch.bfloat16, ), name="McGill-NLP/LLM2Vec-Mistral-7B-Instruct-v2-mntp-unsup-simcse", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="2c055a5d77126c0d3dc6cd8ffa30e2908f4f45f8", release_date="2024-04-09", @@ -225,7 +225,7 @@ def loader_inner(**kwargs: Any) -> Encoder: torch_dtype=torch.bfloat16, ), name="McGill-NLP/LLM2Vec-Llama-2-7b-chat-hf-mntp-supervised", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="2c055a5d77126c0d3dc6cd8ffa30e2908f4f45f8", release_date="2024-04-09", @@ -252,7 +252,7 @@ def loader_inner(**kwargs: Any) -> Encoder: torch_dtype=torch.bfloat16, ), name="McGill-NLP/LLM2Vec-Llama-2-7b-chat-hf-mntp-unsup-simcse", - 
languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="a76944871d169ebe7c97eb921764cd063afed785", release_date="2024-04-09", @@ -279,7 +279,7 @@ def loader_inner(**kwargs: Any) -> Encoder: torch_dtype=torch.bfloat16, ), name="McGill-NLP/LLM2Vec-Sheared-LLaMA-mntp-supervised", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="a5943d406c6b016fef3f07906aac183cf1a0b47d", release_date="2024-04-09", @@ -306,7 +306,7 @@ def loader_inner(**kwargs: Any) -> Encoder: torch_dtype=torch.bfloat16, ), name="McGill-NLP/LLM2Vec-Sheared-LLaMA-mntp-unsup-simcse", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="a5943d406c6b016fef3f07906aac183cf1a0b47d", release_date="2024-04-09", diff --git a/mteb/models/misc_models.py b/mteb/models/misc_models.py index 54d3f3b72a..b895122266 100644 --- a/mteb/models/misc_models.py +++ b/mteb/models/misc_models.py @@ -6,7 +6,6 @@ from mteb.model_meta import ModelMeta, sentence_transformers_loader from mteb.models.bge_models import bge_m3_training_data, bge_training_data -from mteb.models.e5_instruct import E5_MISTRAL_TRAINING_DATA from mteb.models.e5_models import E5_TRAINING_DATA from mteb.models.sentence_transformers_models import sent_trf_training_dataset @@ -14,7 +13,7 @@ name="Haon-Chen/speed-embedding-7b-instruct", revision="c167e9a8144b397622ce47b85d9edcdeecef3d3f", release_date="2024-10-31", - languages=["eng_Latn"], + languages=["eng-Latn"], loader=None, n_parameters=7110660096, memory_usage_mb=13563, @@ -273,7 +272,7 @@ name="Hum-Works/lodestone-base-4096-v1", revision="9bbc2d0b57dd2198aea029404b0f976712a7d966", release_date="2023-08-25", - languages=["eng_Latn"], + languages=["eng-Latn"], loader=None, n_parameters=None, memory_usage_mb=None, @@ -357,29 +356,6 @@ adapted_from="google/gemma-2b", superseded_by=None, ) -BeastyZ__e5_R_mistral_7b = ModelMeta( - name="BeastyZ/e5-R-mistral-7b", - revision="3f810a6a7fd220369ad248e3705cf13d71803602", - 
release_date="2024-06-28", - languages=["eng_Latn"], - loader=None, - n_parameters=7241732096, - memory_usage_mb=27625, - max_tokens=32768.0, - embed_dim=None, - license="apache-2.0", - open_weights=True, - public_training_code=None, - public_training_data=None, - framework=["PyTorch"], - reference="https://huggingface.co/BeastyZ/e5-R-mistral-7b", - similarity_fn_name="cosine", - use_instructions=None, - training_datasets=E5_MISTRAL_TRAINING_DATA, - # not MTEB: {"BeastyZ/E5-R": ["train"]}, - adapted_from="intfloat/e5-mistral-7b-instruct", - superseded_by=None, -) bilingual_embedding_training_data = { "STSBenchmark": ["train"], @@ -420,7 +396,7 @@ name="Lajavaness/bilingual-embedding-large", revision="e83179d7a66e8aed1b3015e98bb5ae234ed89598", release_date="2024-06-24", - languages=["fra_Latn", "eng_Latn"], + languages=["fra-Latn", "eng-Latn"], loader=partial( # type: ignore sentence_transformers_loader, model_name="Lajavaness/bilingual-embedding-large", @@ -447,7 +423,7 @@ name="Lajavaness/bilingual-embedding-small", revision="ed4a1dd814de0db81d4a4e287c296a03194463e3", release_date="2024-07-17", - languages=["fra_Latn", "eng_Latn"], + languages=["fra-Latn", "eng-Latn"], loader=partial( # type: ignore sentence_transformers_loader, model_name="Lajavaness/bilingual-embedding-small", @@ -633,7 +609,7 @@ name="OrdalieTech/Solon-embeddings-large-0.1", revision="9f6465f6ea2f6d10c6294bc15d84edf87d47cdef", release_date="2023-12-09", - languages=["fra_Latn"], + languages=["fra-Latn"], loader=None, n_parameters=559890432, memory_usage_mb=2136, @@ -655,7 +631,7 @@ name="Omartificial-Intelligence-Space/Arabert-all-nli-triplet-Matryoshka", revision="d0361a36f6fe69febfc8550d0918abab174f6f30", release_date="2024-06-16", - languages=["ara_Arab"], + languages=["ara-Arab"], loader=None, n_parameters=135193344, memory_usage_mb=516, @@ -677,7 +653,7 @@ name="Omartificial-Intelligence-Space/Arabic-MiniLM-L12-v2-all-nli-triplet", revision="6916465c43b984e955aa6dc72851474f0128f428", 
release_date="2024-06-25", - languages=["ara_Arab"], + languages=["ara-Arab"], loader=None, n_parameters=117653760, memory_usage_mb=449, @@ -701,7 +677,7 @@ name="Omartificial-Intelligence-Space/Arabic-all-nli-triplet-Matryoshka", revision="1ca467cc576bd76666a4d21b24ee43afa914dd10", release_date="2024-06-14", - languages=["ara_Arab"], + languages=["ara-Arab"], loader=None, n_parameters=278043648, memory_usage_mb=1061, @@ -725,7 +701,7 @@ name="Omartificial-Intelligence-Space/Arabic-labse-Matryoshka", revision="ee6d5e33c78ed582ade47fd452a74ea52aa5bfe2", release_date="2024-06-16", - languages=["ara_Arab"], + languages=["ara-Arab"], loader=None, n_parameters=470926848, memory_usage_mb=1796, @@ -749,7 +725,7 @@ name="Omartificial-Intelligence-Space/Arabic-mpnet-base-all-nli-triplet", revision="2628cb641e040f44328195fadcdfb58e6d5cffa7", release_date="2024-06-15", - languages=["ara_Arab"], + languages=["ara-Arab"], loader=None, n_parameters=109486464, memory_usage_mb=418, @@ -773,7 +749,7 @@ name="Omartificial-Intelligence-Space/Marbert-all-nli-triplet-Matryoshka", revision="ecf3274e164f057c4a3dd70691cae0265d87a9d0", release_date="2024-06-17", - languages=["ara_Arab"], + languages=["ara-Arab"], loader=None, n_parameters=162841344, memory_usage_mb=621, @@ -892,7 +868,7 @@ name="manu/sentence_croissant_alpha_v0.4", revision="0ce6372e6a3c21134dcf26dcde13cca869c767fc", release_date="2024-04-27", - languages=["fra_Latn", "eng_Latn"], + languages=["fra-Latn", "eng-Latn"], loader=None, n_parameters=1279887360, memory_usage_mb=2441, @@ -915,7 +891,7 @@ name="thenlper/gte-base", revision="c078288308d8dee004ab72c6191778064285ec0c", release_date="2023-07-27", - languages=["eng_Latn"], + languages=["eng-Latn"], loader=None, n_parameters=109482752, memory_usage_mb=209, @@ -937,7 +913,7 @@ name="thenlper/gte-large", revision="4bef63f39fcc5e2d6b0aae83089f307af4970164", release_date="2023-07-27", - languages=["eng_Latn"], + languages=["eng-Latn"], loader=None, n_parameters=335142400, 
memory_usage_mb=639, @@ -959,7 +935,7 @@ name="thenlper/gte-small", revision="17e1f347d17fe144873b1201da91788898c639cd", release_date="2023-07-27", - languages=["eng_Latn"], + languages=["eng-Latn"], loader=None, n_parameters=33360512, memory_usage_mb=64, @@ -981,7 +957,7 @@ name="OrlikB/KartonBERT-USE-base-v1", revision="1f59dd58fe57995c0e867d5e29f03763eae99645", release_date="2024-09-30", - languages=["pol_Latn"], + languages=["pol-Latn"], loader=None, n_parameters=103705344, memory_usage_mb=396, @@ -1003,7 +979,7 @@ name="OrlikB/st-polish-kartonberta-base-alpha-v1", revision="5590a0e2d7bb43674e44d7076b3ff157f7d4a1cb", release_date="2023-11-12", - languages=["pol_Latn"], + languages=["pol-Latn"], loader=None, n_parameters=None, memory_usage_mb=None, @@ -1025,7 +1001,7 @@ name="sdadas/mmlw-e5-base", revision="f10628ed55b5ec400502aff439bd714a6da0af30", release_date="2023-11-17", - languages=["pol_Latn"], + languages=["pol-Latn"], loader=None, n_parameters=278043648, memory_usage_mb=1061, @@ -1047,7 +1023,7 @@ name="dwzhu/e5-base-4k", revision="1b5664b8cb2bccd8c309429b7bfe5864402e8fbc", release_date="2024-03-28", - languages=["eng_Latn"], + languages=["eng-Latn"], loader=None, n_parameters=None, memory_usage_mb=None, @@ -1071,7 +1047,7 @@ name="sdadas/mmlw-e5-large", revision="5c143fb045ebed664fd85b43fc45155999eb110f", release_date="2023-11-17", - languages=["pol_Latn"], + languages=["pol-Latn"], loader=None, n_parameters=559890432, memory_usage_mb=2136, @@ -1093,7 +1069,7 @@ name="sdadas/mmlw-e5-small", revision="ff1298cb6d997f18b794d2f3d73cad2ba2ad739a", release_date="2023-11-17", - languages=["pol_Latn"], + languages=["pol-Latn"], loader=None, n_parameters=117653760, memory_usage_mb=449, @@ -1115,7 +1091,7 @@ name="sdadas/mmlw-roberta-base", revision="0ac7f23f6c96af601fa6a17852bd08d5136d6365", release_date="2023-11-17", - languages=["pol_Latn"], + languages=["pol-Latn"], loader=None, n_parameters=124442880, memory_usage_mb=475, @@ -1137,7 +1113,7 @@ 
name="sdadas/mmlw-roberta-large", revision="b8058066a8de32d0737b3cd82d8b4f4108745af9", release_date="2023-11-17", - languages=["pol_Latn"], + languages=["pol-Latn"], loader=None, n_parameters=434961408, memory_usage_mb=1659, @@ -1163,51 +1139,51 @@ } udever_langauges = [ - "aka_Latn", - "ara_Arab", - "asm_Beng", - "bam_Latn", - "ben_Beng", - "cat_Latn", - "eng_Latn", - "spa_Latn", - "eus_Latn", - "fon_Latn", - "fra_Latn", - "guj_Gujr", - "hin_Deva", - "ind_Latn", - "ibo_Latn", - "kik_Latn", - "kan_Knda", - "lug_Latn", - "lin_Latn", - "mal_Mlym", - "mar_Deva", - "nep_Deva", - "nso_Latn", - "nya_Latn", - "ori_Orya", - "pan_Guru", - "por_Latn", - "run_Latn", - "kin_Latn", - "sna_Latn", - "sot_Latn", - "swa_Latn", - "tam_Taml", - "tel_Telu", - "tsn_Latn", - "tso_Latn", - "tum_Latn", - "twi_Latn", - "urd_Arab", - "vie_Latn", - "wol_Latn", - "xho_Latn", - "yor_Latn", - "zho_Hans", - "zul_Latn", + "aka-Latn", + "ara-Arab", + "asm-Beng", + "bam-Latn", + "ben-Beng", + "cat-Latn", + "eng-Latn", + "spa-Latn", + "eus-Latn", + "fon-Latn", + "fra-Latn", + "guj-Gujr", + "hin-Deva", + "ind-Latn", + "ibo-Latn", + "kik-Latn", + "kan-Knda", + "lug-Latn", + "lin-Latn", + "mal-Mlym", + "mar-Deva", + "nep-Deva", + "nso-Latn", + "nya-Latn", + "ori-Orya", + "pan-Guru", + "por-Latn", + "run-Latn", + "kin-Latn", + "sna-Latn", + "sot-Latn", + "swa-Latn", + "tam-Taml", + "tel-Telu", + "tsn-Latn", + "tso-Latn", + "tum-Latn", + "twi-Latn", + "urd-Arab", + "vie-Latn", + "wol-Latn", + "xho-Latn", + "yor-Latn", + "zho-Hans", + "zul-Latn", ] izhx__udever_bloom_1b1 = ModelMeta( @@ -1302,7 +1278,7 @@ name="avsolatorio/GIST-Embedding-v0", revision="bf6b2e55e92f510a570ad4d7d2da2ec8cd22590c", release_date="2024-01-31", - languages=["eng_Latn"], + languages=["eng-Latn"], loader=None, n_parameters=109482240, memory_usage_mb=418, @@ -1341,7 +1317,7 @@ name="avsolatorio/GIST-all-MiniLM-L6-v2", revision="ea89dfad053bba14677bb784a4269898abbdce44", release_date="2024-02-03", - languages=["eng_Latn"], + 
languages=["eng-Latn"], loader=None, n_parameters=22713216, memory_usage_mb=87, @@ -1380,7 +1356,7 @@ name="avsolatorio/GIST-large-Embedding-v0", revision="7831200e2f7819b994490c091cf3258a2b821f0c", release_date="2024-02-14", - languages=["eng_Latn"], + languages=["eng-Latn"], loader=None, n_parameters=335141888, memory_usage_mb=1278, @@ -1419,7 +1395,7 @@ name="avsolatorio/GIST-small-Embedding-v0", revision="d6c4190f9e01b9994dc7cac99cf2f2b85cfb57bc", release_date="2024-02-03", - languages=["eng_Latn"], + languages=["eng-Latn"], loader=None, n_parameters=33360000, memory_usage_mb=127, @@ -1480,7 +1456,7 @@ name="aari1995/German_Semantic_STS_V2", revision="22912542b0ec7a7ef369837e28ffe6352a27afc9", release_date="2022-11-17", - languages=["deu_Latn"], + languages=["deu-Latn"], loader=None, n_parameters=335736320, memory_usage_mb=1281, @@ -1503,7 +1479,7 @@ name="abhinand/MedEmbed-small-v0.1", revision="40a5850d046cfdb56154e332b4d7099b63e8d50e", release_date="2024-10-20", - languages=["eng_Latn"], + languages=["eng-Latn"], loader=None, n_parameters=33360000, memory_usage_mb=127, @@ -1534,7 +1510,7 @@ name="avsolatorio/NoInstruct-small-Embedding-v0", revision="b38747000553d8268915c95a55fc87e707c9aadd", release_date="2024-05-01", - languages=["eng_Latn"], + languages=["eng-Latn"], loader=None, n_parameters=33360000, memory_usage_mb=127, @@ -1556,7 +1532,7 @@ name="brahmairesearch/slx-v0.1", revision="688c83fd1a7f34b25575a2bc26cfd87c11b4ce71", release_date="2024-08-13", - languages=["eng_Latn"], + languages=["eng-Latn"], loader=None, n_parameters=22713216, memory_usage_mb=87, @@ -1600,7 +1576,7 @@ name="deepvk/USER-bge-m3", revision="0cc6cfe48e260fb0474c753087a69369e88709ae", release_date="2024-07-05", - languages=["rus_Cyrl"], + languages=["rus-Cyrl"], loader=None, n_parameters=359026688, memory_usage_mb=1370, @@ -1633,7 +1609,7 @@ name="infgrad/stella-base-en-v2", revision="c9e80ff9892d80b39dc54e30a7873f91ea161034", release_date="2023-10-19", - languages=["eng_Latn"], 
+ languages=["eng-Latn"], loader=None, n_parameters=None, memory_usage_mb=None, @@ -1677,7 +1653,7 @@ name="omarelshehy/arabic-english-sts-matryoshka", revision="763d116fbe8bf7883c64635c862feeaa3768bb64", release_date="2024-10-13", - languages=["ara_Arab", "eng_Latn"], + languages=["ara-Arab", "eng-Latn"], loader=None, n_parameters=559890432, memory_usage_mb=2136, @@ -1710,7 +1686,7 @@ name="openbmb/MiniCPM-Embedding", revision="c0cb2de33fb366e17c30f9d53142ff11bc18e049", release_date="2024-09-04", - languages=["zho_Hans", "eng_Latn"], + languages=["zho-Hans", "eng-Latn"], n_parameters=2724880896, memory_usage_mb=5197, max_tokens=512.0, @@ -1732,15 +1708,15 @@ revision="6633dc49e554de7105458f8f2e96445c6598e9d1", release_date="2023-06-22", languages=[ - "zho_Hans", - "eng_Latn", - "deu_Latn", - "fra_Latn", - "ita_Latn", - "nld_Latn", - "por_Latn", - "pol_Latn", - "rus_Cyrl", + "zho-Hans", + "eng-Latn", + "deu-Latn", + "fra-Latn", + "ita-Latn", + "nld-Latn", + "por-Latn", + "pol-Latn", + "rus-Cyrl", ], loader=None, n_parameters=117654272, @@ -1764,7 +1740,7 @@ name="silma-ai/silma-embeddding-matryoshka-v0.1", revision="a520977a9542ebdb8a7206df6b7ff6977f1886ea", release_date="2024-10-12", - languages=["ara_Arab", "eng_Latn"], + languages=["ara-Arab", "eng-Latn"], loader=None, n_parameters=135193344, memory_usage_mb=516, @@ -1787,7 +1763,7 @@ name="DMetaSoul/sbert-chinese-general-v1", revision="bd27765956bcc2fcf682de0097819947ac10037e", release_date="2022-03-25", - languages=["zho_Hans"], + languages=["zho-Hans"], loader=None, n_parameters=None, memory_usage_mb=None, # Not visible on repo @@ -1813,7 +1789,7 @@ name="DMetaSoul/Dmeta-embedding-zh-small", revision="2050d3439a2f68999dd648c1697471acaac37a29", release_date="2024-03-25", - languages=["zho_Hans"], + languages=["zho-Hans"], loader=None, n_parameters=74.2 * 1e6, memory_usage_mb=283, @@ -1834,7 +1810,7 @@ name="lier007/xiaobu-embedding", revision="59c79d82eb5223cd9895f6eb8e825c7fa10e4e92", 
release_date="2024-01-09", - languages=["zho_Hans"], + languages=["zho-Hans"], loader=None, n_parameters=326 * 1e6, memory_usage_mb=1244, @@ -1856,7 +1832,7 @@ name="lier007/xiaobu-embedding-v2", revision="1912f2e59a5c2ef802a471d735a38702a5c9485e", release_date="2024-06-30", - languages=["zho_Hans"], + languages=["zho-Hans"], loader=None, n_parameters=326 * 1e6, memory_usage_mb=1242, @@ -1878,7 +1854,7 @@ name="Classical/Yinka", revision="59c79d82eb5223cd9895f6eb8e825c7fa10e4e92", release_date="2024-01-09", - languages=["zho_Hans"], + languages=["zho-Hans"], loader=None, n_parameters=326 * 1e6, memory_usage_mb=1244, @@ -1900,7 +1876,7 @@ name="TencentBAC/Conan-embedding-v1", revision="bb9749a57d4f02fd71722386f8d0f5a9398d7eeb", release_date="2024-08-22", - languages=["zho_Hans"], + languages=["zho-Hans"], loader=None, n_parameters=326 * 1e6, memory_usage_mb=1242, @@ -1922,7 +1898,7 @@ name="llmrails/ember-v1", revision="5e5ce5904901f6ce1c353a95020f17f09e5d021d", release_date="2023-10-10", - languages=["eng_Latn"], + languages=["eng-Latn"], n_parameters=335 * 1e6, memory_usage_mb=1278, max_tokens=512, @@ -1942,7 +1918,7 @@ name="amazon/Titan-text-embeddings-v2", revision="1", release_date="2024-04-30", - languages=["eng_Latn"], + languages=["eng-Latn"], n_parameters=None, memory_usage_mb=None, max_tokens=None, diff --git a/mteb/models/moco_models.py b/mteb/models/moco_models.py index 1383447493..b9b7928112 100644 --- a/mteb/models/moco_models.py +++ b/mteb/models/moco_models.py @@ -150,7 +150,7 @@ def get_fused_embeddings( model_name="nyu-visionx/moco-v3-vit-b", ), name="nyu-visionx/moco-v3-vit-b", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="7d091cd70772c5c0ecf7f00b5f12ca609a99d69d", release_date="2024-06-03", modalities=["image"], @@ -175,7 +175,7 @@ def get_fused_embeddings( model_name="nyu-visionx/moco-v3-vit-l", ), name="nyu-visionx/moco-v3-vit-l", - languages=["eng_Latn"], + languages=["eng-Latn"], 
revision="7bf75358d616f39b9716148bf4e3425f3bd35b47", release_date="2024-06-03", modalities=["image"], diff --git a/mteb/models/model2vec_models.py b/mteb/models/model2vec_models.py index cbdc3c0a13..69b716f186 100644 --- a/mteb/models/model2vec_models.py +++ b/mteb/models/model2vec_models.py @@ -56,7 +56,7 @@ def encode( model_name="minishlab/M2V_base_glove_subword", ), name="minishlab/M2V_base_glove_subword", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="5f4f5ca159b7321a8b39739bba0794fa0debddf4", release_date="2024-09-21", @@ -83,7 +83,7 @@ def encode( model_name="minishlab/M2V_base_glove", ), name="minishlab/M2V_base_glove", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="38ebd7f10f71e67fa8db898290f92b82e9cfff2b", release_date="2024-09-21", @@ -109,7 +109,7 @@ def encode( model_name="minishlab/M2V_base_output", ), name="minishlab/M2V_base_output", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="02460ae401a22b09d2c6652e23371398329551e2", release_date="2024-09-21", @@ -135,7 +135,7 @@ def encode( model_name="minishlab/M2V_multilingual_output", ), name="minishlab/M2V_multilingual_output", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="2cf4ec4e1f51aeca6c55cf9b93097d00711a6305", release_date="2024-09-21", @@ -161,7 +161,7 @@ def encode( model_name="minishlab/potion-base-2M", ), name="minishlab/potion-base-2M", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="86db093558fbced2072b929eb1690bce5272bd4b", release_date="2024-10-29", @@ -187,7 +187,7 @@ def encode( model_name="minishlab/potion-base-4M", ), name="minishlab/potion-base-4M", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="81b1802ada41afcd0987a37dc15e569c9fa76f04", release_date="2024-10-29", @@ -213,7 +213,7 @@ def encode( model_name="minishlab/potion-base-8M", ), name="minishlab/potion-base-8M", - 
languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="dcbec7aa2d52fc76754ac6291803feedd8c619ce", release_date="2024-10-29", @@ -238,7 +238,7 @@ def encode( Model2VecWrapper, model_name="NeuML/pubmedbert-base-embeddings-100K" ), name="NeuML/pubmedbert-base-embeddings-100K", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="bac5e3b12fb8c650e92a19c41b436732c4f16e9e", release_date="2025-01-03", @@ -263,7 +263,7 @@ def encode( Model2VecWrapper, model_name="NeuML/pubmedbert-base-embeddings-500K" ), name="NeuML/pubmedbert-base-embeddings-500K", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="34ba71e35c393fdad7ed695113f653feb407b16b", release_date="2025-01-03", @@ -286,7 +286,7 @@ def encode( pubmed_bert_1m = ModelMeta( loader=partial(Model2VecWrapper, model_name="NeuML/pubmedbert-base-embeddings-1M"), name="NeuML/pubmedbert-base-embeddings-1M", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="2b7fed222594708da6d88bcda92ae9b434b7ddd1", release_date="2025-01-03", @@ -309,7 +309,7 @@ def encode( pubmed_bert_2m = ModelMeta( loader=partial(Model2VecWrapper, model_name="NeuML/pubmedbert-base-embeddings-2M"), name="NeuML/pubmedbert-base-embeddings-2M", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="1d7bbe04d6713e425161146bfdc71473cbed498a", release_date="2025-01-03", @@ -332,7 +332,7 @@ def encode( pubmed_bert_8m = ModelMeta( loader=partial(Model2VecWrapper, model_name="NeuML/pubmedbert-base-embeddings-8M"), name="NeuML/pubmedbert-base-embeddings-8M", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="387d350015e963744f4fafe56a574b7cd48646c9", release_date="2025-01-03", diff --git a/mteb/models/moka_models.py b/mteb/models/moka_models.py index 75103825ec..60438eaec0 100644 --- a/mteb/models/moka_models.py +++ b/mteb/models/moka_models.py @@ -84,7 +84,7 @@ m3e_base = ModelMeta( 
name="moka-ai/m3e-base", - languages=["zho_Hans", "eng-Latn"], + languages=["zho-Hans", "eng-Latn"], open_weights=True, revision="764b537a0e50e5c7d64db883f2d2e051cbe3c64c", release_date="2023-06-06", # first commit @@ -107,7 +107,7 @@ m3e_small = ModelMeta( name="moka-ai/m3e-small", - languages=["zho_Hans", "eng-Latn"], + languages=["zho-Hans", "eng-Latn"], open_weights=True, revision="44c696631b2a8c200220aaaad5f987f096e986df", release_date="2023-06-02", # first commit @@ -130,7 +130,7 @@ m3e_large = ModelMeta( name="moka-ai/m3e-large", - languages=["zho_Hans", "eng-Latn"], + languages=["zho-Hans", "eng-Latn"], open_weights=True, revision="12900375086c37ba5d83d1e417b21dc7d1d1f388", release_date="2023-06-21", # first commit diff --git a/mteb/models/mxbai_models.py b/mteb/models/mxbai_models.py index f3476b264e..0e8f81a3f0 100644 --- a/mteb/models/mxbai_models.py +++ b/mteb/models/mxbai_models.py @@ -23,7 +23,7 @@ }, ), name="mixedbread-ai/mxbai-embed-large-v1", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="990580e27d329c7408b3741ecff85876e128e203", release_date="2024-03-07", # initial commit of hf model. @@ -44,7 +44,7 @@ mxbai_embed_2d_large_v1 = ModelMeta( loader=None, name="mixedbread-ai/mxbai-embed-2d-large-v1", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="7e639ca8e344af398876ead3b19ec3c0b9068f49", release_date="2024-03-04", # initial commit of hf model. @@ -68,7 +68,7 @@ mxbai_embed_xsmall_v1 = ModelMeta( loader=None, name="mixedbread-ai/mxbai-embed-xsmall-v1", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="2f741ec33328bb57e4704e1238fc59a4a5745705", release_date="2024-08-13", # initial commit of hf model. 
diff --git a/mteb/models/nomic_models_vision.py b/mteb/models/nomic_models_vision.py index 661bb7aa1f..05ad575e05 100644 --- a/mteb/models/nomic_models_vision.py +++ b/mteb/models/nomic_models_vision.py @@ -167,7 +167,7 @@ def get_fused_embeddings( text_model_name="nomic-ai/nomic-embed-text-v1.5", ), name="nomic-ai/nomic-embed-vision-v1.5", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="af2246fffdab78d8458418480e4886a8e48b70a7", release_date="2024-06-08", modalities=["image", "text"], diff --git a/mteb/models/nvidia_models.py b/mteb/models/nvidia_models.py index ff0f3f5ef2..d3cd2ea29b 100644 --- a/mteb/models/nvidia_models.py +++ b/mteb/models/nvidia_models.py @@ -94,7 +94,7 @@ def instruction_template( add_eos_token=True, ), name="nvidia/NV-Embed-v2", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="7604d305b621f14095a1aa23d351674c2859553a", release_date="2024-09-09", # initial commit of hf model. @@ -125,7 +125,7 @@ def instruction_template( add_eos_token=True, ), name="nvidia/NV-Embed-v1", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="570834afd5fef5bf3a3c2311a2b6e0a66f6f4f2c", release_date="2024-09-13", # initial commit of hf model. 
diff --git a/mteb/models/openclip_models.py b/mteb/models/openclip_models.py index cca2a9caef..f0376e1bca 100644 --- a/mteb/models/openclip_models.py +++ b/mteb/models/openclip_models.py @@ -162,7 +162,7 @@ def get_fused_embeddings( model_name="laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90K", ), name="laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90K", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="84c9828e63dc9a9351d1fe637c346d4c1c4db341", release_date="2023-04-26", modalities=["image", "text"], @@ -189,7 +189,7 @@ def get_fused_embeddings( model_name="laion/CLIP-ViT-B-32-DataComp.XL-s13B-b90K", ), name="laion/CLIP-ViT-B-32-DataComp.XL-s13B-b90K", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="f0e2ffa09cbadab3db6a261ec1ec56407ce42912", release_date="2023-04-26", modalities=["image", "text"], @@ -216,7 +216,7 @@ def get_fused_embeddings( model_name="laion/CLIP-ViT-B-16-DataComp.XL-s13B-b90K", ), name="laion/CLIP-ViT-B-16-DataComp.XL-s13B-b90K", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="d110532e8d4ff91c574ee60a342323f28468b287", release_date="2023-04-26", modalities=["image", "text"], @@ -243,7 +243,7 @@ def get_fused_embeddings( model_name="laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", ), name="laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="bc7788f151930d91b58474715fdce5524ad9a189", release_date="2023-01-23", modalities=["image", "text"], @@ -270,7 +270,7 @@ def get_fused_embeddings( model_name="laion/CLIP-ViT-g-14-laion2B-s34B-b88K", ), name="laion/CLIP-ViT-g-14-laion2B-s34B-b88K", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="15efd0f6ac0c40c0f9da7becca03c974d7012604", release_date="2023-03-06", modalities=["image", "text"], @@ -297,7 +297,7 @@ def get_fused_embeddings( model_name="laion/CLIP-ViT-H-14-laion2B-s32B-b79K", ), name="laion/CLIP-ViT-H-14-laion2B-s32B-b79K", - languages=["eng_Latn"], + languages=["eng-Latn"], 
revision="de081ac0a0ca8dc9d1533eed1ae884bb8ae1404b", release_date="2022-09-15", modalities=["image", "text"], @@ -324,7 +324,7 @@ def get_fused_embeddings( model_name="laion/CLIP-ViT-L-14-laion2B-s32B-b82K", ), name="laion/CLIP-ViT-L-14-laion2B-s32B-b82K", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="1627032197142fbe2a7cfec626f4ced3ae60d07a", release_date="2022-09-15", modalities=["image", "text"], @@ -351,7 +351,7 @@ def get_fused_embeddings( model_name="laion/CLIP-ViT-B-32-laion2B-s34B-b79K", ), name="laion/CLIP-ViT-B-32-laion2B-s34B-b79K", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="08f73555f1b2fb7c82058aebbd492887a94968ef", release_date="2022-09-15", modalities=["image", "text"], diff --git a/mteb/models/ops_moa_models.py b/mteb/models/ops_moa_models.py index 9c4d4c5e18..bbee0ba9f7 100644 --- a/mteb/models/ops_moa_models.py +++ b/mteb/models/ops_moa_models.py @@ -26,7 +26,7 @@ def encode(self, sentences: list[str], **kwargs) -> np.ndarray: name="OpenSearch-AI/Ops-MoA-Conan-embedding-v1", revision="46dcd58753f3daa920c66f89e47086a534089350", release_date="2025-03-26", - languages=["zho_Hans"], + languages=["zho-Hans"], loader=partial( CustomWrapper, "OpenSearch-AI/Ops-MoA-Conan-embedding-v1", @@ -61,7 +61,7 @@ def encode(self, sentences: list[str], **kwargs) -> np.ndarray: name="OpenSearch-AI/Ops-MoA-Yuan-embedding-1.0", revision="23712d0766417b0eb88a2513c6e212a58b543268", release_date="2025-03-26", - languages=["zho_Hans"], + languages=["zho-Hans"], loader=partial( CustomWrapper, "OpenSearch-AI/Ops-MoA-Yuan-embedding-1.0", diff --git a/mteb/models/overview.py b/mteb/models/overview.py index afb30ba525..972f7efaaa 100644 --- a/mteb/models/overview.py +++ b/mteb/models/overview.py @@ -28,6 +28,7 @@ cohere_models, cohere_v, colbert_models, + conan_models, dino_models, e5_instruct, e5_models, @@ -64,6 +65,7 @@ promptriever_models, qodo_models, qtack_models, + relle_models, repllama_models, rerankers_custom, rerankers_monot5_based, 
@@ -71,11 +73,14 @@ ru_sentence_models, salesforce_models, searchmap_models, + seed_models, sentence_transformers_models, + shuu_model, siglip_models, sonar_models, stella_models, text2vec_models, + ua_sentence_models, uae_models, vdr_models, vista_models, @@ -103,6 +108,7 @@ cohere_models, cohere_v, colbert_models, + conan_models, dino_models, e5_instruct, e5_models, @@ -137,14 +143,17 @@ promptriever_models, qodo_models, qtack_models, + relle_models, repllama_models, rerankers_custom, rerankers_monot5_based, richinfoai_models, ru_sentence_models, + ua_sentence_models, salesforce_models, searchmap_models, sentence_transformers_models, + shuu_model, siglip_models, vista_models, vlm2vec_models, @@ -163,6 +172,8 @@ nb_sbert, wavlm_models, whisper_models, + nb_sbert, + seed_models, ] MODEL_REGISTRY = {} diff --git a/mteb/models/piccolo_models.py b/mteb/models/piccolo_models.py index 4c24e9ba86..b4ffc8949a 100644 --- a/mteb/models/piccolo_models.py +++ b/mteb/models/piccolo_models.py @@ -6,7 +6,7 @@ piccolo_base_zh = ModelMeta( name="sensenova/piccolo-base-zh", - languages=["zho_Hans"], + languages=["zho-Hans"], open_weights=True, revision="47c0a63b8f667c3482e05b2fd45577bb19252196", release_date="2023-09-04", # first commit @@ -28,7 +28,7 @@ piccolo_large_zh_v2 = ModelMeta( name="sensenova/piccolo-large-zh-v2", - languages=["zho_Hans"], + languages=["zho-Hans"], open_weights=False, # They "temporarily" removed it in may last year # "Due to certain internal company considerations" revision="05948c1d889355936bdf9db7d30df57dd78d25a3", diff --git a/mteb/models/promptriever_models.py b/mteb/models/promptriever_models.py index cbed2e89c8..7e0576234b 100644 --- a/mteb/models/promptriever_models.py +++ b/mteb/models/promptriever_models.py @@ -50,7 +50,7 @@ def loader_inner(**kwargs: Any) -> Encoder: torch_dtype=torch.bfloat16, ), name="samaya-ai/promptriever-llama2-7b-v1", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, 
revision="01c7f73d771dfac7d292323805ebc428287df4f9-30b14e3813c0fa45facfd01a594580c3fe5ecf23", # base-peft revision release_date="2024-09-15", @@ -77,7 +77,7 @@ def loader_inner(**kwargs: Any) -> Encoder: torch_dtype=torch.bfloat16, ), name="samaya-ai/promptriever-llama3.1-8b-v1", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="48d6d0fc4e02fb1269b36940650a1b7233035cbb-2ead22cfb1b0e0c519c371c63c2ab90ffc511b8a", # base-peft revision training_datasets={ @@ -107,7 +107,7 @@ def loader_inner(**kwargs: Any) -> Encoder: torch_dtype=torch.bfloat16, ), name="samaya-ai/promptriever-llama3.1-8b-instruct-v1", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="5206a32e0bd3067aef1ce90f5528ade7d866253f-8b677258615625122c2eb7329292b8c402612c21", # base-peft revision release_date="2024-09-15", @@ -137,7 +137,7 @@ def loader_inner(**kwargs: Any) -> Encoder: torch_dtype=torch.bfloat16, ), name="samaya-ai/promptriever-mistral-v0.1-7b-v1", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="7231864981174d9bee8c7687c24c8344414eae6b-876d63e49b6115ecb6839893a56298fadee7e8f5", # base-peft revision release_date="2024-09-15", diff --git a/mteb/models/qodo_models.py b/mteb/models/qodo_models.py index f5125c7638..693bcf6648 100644 --- a/mteb/models/qodo_models.py +++ b/mteb/models/qodo_models.py @@ -2,19 +2,22 @@ from mteb.model_meta import ModelMeta +qodo_languages = [ + "python-Code", + "c++-Code", + "c#-Code", + "go-Code", + "java-Code", + "javascript-Code", + "php-Code", + "ruby-Code", + "typescript-Code", +] + + Qodo_Embed_1_1_5B = ModelMeta( name="Qodo/Qodo-Embed-1-1.5B", - languages=[ - "python-Code", - "c++-Code", - "c#-Code", - "go-Code", - "java-Code", - "Javascript-Code", - "php-Code", - "ruby-Code", - "typescript-Code", - ], + languages=qodo_languages, open_weights=True, revision="84bbef079b32e8823ec226d4e9e92902706b0eb6", release_date="2025-02-19", @@ -35,17 +38,7 @@ Qodo_Embed_1_7B = 
ModelMeta( name="Qodo/Qodo-Embed-1-7B", - languages=[ - "python-Code", - "c++-Code", - "c#-Code", - "go-Code", - "java-Code", - "Javascript-Code", - "php-Code", - "ruby-Code", - "typescript-Code", - ], + languages=qodo_languages, open_weights=True, revision="f9edd9bf7f687c0e832424058e265120f603cd81", release_date="2025-02-24", diff --git a/mteb/models/qtack_models.py b/mteb/models/qtack_models.py index 4cfd43461a..662272168c 100644 --- a/mteb/models/qtack_models.py +++ b/mteb/models/qtack_models.py @@ -32,7 +32,7 @@ revision="7fbe6f9b4cc42615e0747299f837ad7769025492", ), name="prdev/mini-gte", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="7fbe6f9b4cc42615e0747299f837ad7769025492", release_date="2025-01-28", diff --git a/mteb/models/relle_models.py b/mteb/models/relle_models.py new file mode 100644 index 0000000000..e8ec9bd971 --- /dev/null +++ b/mteb/models/relle_models.py @@ -0,0 +1,32 @@ +from __future__ import annotations + +from functools import partial + +from mteb.model_meta import ModelMeta, sentence_transformers_loader + +relle_en = ModelMeta( + loader=partial( + sentence_transformers_loader, + model_name="bchoiced/RELLE", + revision="093417798a062ed8ddd8424df3be62145d4ace91", + ), + name="bchoiced/RELLE", + languages=[ + "eng-Latn", + ], + open_weights=True, + revision="093417798a062ed8ddd8424df3be62145d4ace91", + release_date="2025-04-17", + n_parameters=7_110_000_000, + memory_usage_mb=27125, + embed_dim=4096, + license="cc-by-sa-4.0", + max_tokens=32768, + reference="https://huggingface.co/bchoiced/RELLE", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=True, + public_training_code=None, + public_training_data=None, + training_datasets=None, +) diff --git a/mteb/models/repllama_models.py b/mteb/models/repllama_models.py index 549f231c93..eed13cd71b 100644 --- a/mteb/models/repllama_models.py +++ b/mteb/models/repllama_models.py @@ -134,7 +134,7 @@ def 
loader_inner(**kwargs: Any) -> Encoder: model_prompts=model_prompts, ), name="castorini/repllama-v1-7b-lora-passage", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="01c7f73d771dfac7d292323805ebc428287df4f9-6097554dfe6e7d93e92f55010b678bcca1e233a8", # base-peft revision release_date="2023-10-11", @@ -166,7 +166,7 @@ def loader_inner(**kwargs: Any) -> Encoder: model_prompts=model_prompts, ), name="samaya-ai/RepLLaMA-reproduced", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="01c7f73d771dfac7d292323805ebc428287df4f9-ad5c1d0938a1e02954bcafb4d811ba2f34052e71", # base-peft revision release_date="2024-09-15", diff --git a/mteb/models/rerankers_custom.py b/mteb/models/rerankers_custom.py index 32a2534c0e..8804558bc4 100644 --- a/mteb/models/rerankers_custom.py +++ b/mteb/models/rerankers_custom.py @@ -64,7 +64,7 @@ def __init__( requires_package( self, - "flagembedding", + "FlagEmbedding", model_name_or_path, "pip install 'mteb[flagembedding]'", ) @@ -203,7 +203,7 @@ def loader_inner(**kwargs: Any) -> Encoder: fp_options="float16", ), name="castorini/monobert-large-msmarco", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="0a97706f3827389da43b83348d5d18c9d53876fa", release_date="2020-05-28", @@ -230,7 +230,7 @@ def loader_inner(**kwargs: Any) -> Encoder: fp_options="float16", ), name="jinaai/jina-reranker-v2-base-multilingual", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="126747772a932960028d9f4dc93bd5d9c4869be4", release_date="2024-09-26", @@ -257,38 +257,38 @@ def loader_inner(**kwargs: Any) -> Encoder: ), name="BAAI/bge-reranker-v2-m3", languages=[ - "eng_Latn", - "ara_Arab", - "ben_Beng", - "spa_Latn", - "fas_Arab", - "fin_Latn", - "fra_Latn", - "hin_Deva", - "ind_Latn", - "jpn_Jpan", - "kor_Hang", - "rus_Cyrl", - "swa_Latn", - "tel_Telu", - "tha_Thai", - "zho_Hans", - "deu_Latn", - "yor_Latn", - "dan_Latn", - "heb_Hebr", - "hun_Latn", 
- "ita_Latn", - "khm_Khmr", - "msa_Latn", - "nld_Latn", - "nob_Latn", - "pol_Latn", - "por_Latn", - "swe_Latn", - "tur_Latn", - "vie_Latn", - "zho_Hant", + "eng-Latn", + "ara-Arab", + "ben-Beng", + "spa-Latn", + "fas-Arab", + "fin-Latn", + "fra-Latn", + "hin-Deva", + "ind-Latn", + "jpn-Jpan", + "kor-Hang", + "rus-Cyrl", + "swa-Latn", + "tel-Telu", + "tha-Thai", + "zho-Hans", + "deu-Latn", + "yor-Latn", + "dan-Latn", + "heb-Hebr", + "hun-Latn", + "ita-Latn", + "khm-Khmr", + "msa-Latn", + "nld-Latn", + "nob-Latn", + "pol-Latn", + "por-Latn", + "swe-Latn", + "tur-Latn", + "vie-Latn", + "zho-Hant", ], open_weights=True, revision="953dc6f6f85a1b2dbfca4c34a2796e7dde08d41e", diff --git a/mteb/models/rerankers_monot5_based.py b/mteb/models/rerankers_monot5_based.py index f94508c548..2cd4266cb6 100644 --- a/mteb/models/rerankers_monot5_based.py +++ b/mteb/models/rerankers_monot5_based.py @@ -292,7 +292,7 @@ def get_prediction_tokens(self, *args, **kwargs): fp_options="float16", ), name="castorini/monot5-small-msmarco-10k", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="77f8e3f7b1eb1afe353aa21a7c3a2fc8feca702e", release_date="2022-03-28", @@ -318,7 +318,7 @@ def get_prediction_tokens(self, *args, **kwargs): fp_options="float16", ), name="castorini/monot5-base-msmarco-10k", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="f15657ab3d2a5dd0b9a30c8c0b6a0a73c9cb5884", release_date="2022-03-28", @@ -344,7 +344,7 @@ def get_prediction_tokens(self, *args, **kwargs): fp_options="float16", ), name="castorini/monot5-large-msmarco-10k", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="48cfad1d8dd587670393f27ee8ec41fde63e3d98", release_date="2022-03-28", @@ -370,7 +370,7 @@ def get_prediction_tokens(self, *args, **kwargs): fp_options="float16", ), name="castorini/monot5-3b-msmarco-10k", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, 
revision="bc0c419a438c81f592f878ce32430a1823f5db6c", release_date="2022-03-28", @@ -396,7 +396,7 @@ def get_prediction_tokens(self, *args, **kwargs): fp_options="float16", ), name="google/flan-t5-base", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="7bcac572ce56db69c1ea7c8af255c5d7c9672fc2", release_date="2022-10-21", @@ -433,7 +433,7 @@ def get_prediction_tokens(self, *args, **kwargs): fp_options="float16", ), name="google/flan-t5-large", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="0613663d0d48ea86ba8cb3d7a44f0f65dc596a2a", release_date="2022-10-21", @@ -470,7 +470,7 @@ def get_prediction_tokens(self, *args, **kwargs): fp_options="float16", ), name="google/flan-t5-xl", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="7d6315df2c2fb742f0f5b556879d730926ca9001", release_date="2022-10-21", @@ -507,7 +507,7 @@ def get_prediction_tokens(self, *args, **kwargs): fp_options="float16", ), name="google/flan-t5-xxl", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="ae7c9136adc7555eeccc78cdd960dfd60fb346ce", release_date="2022-10-21", @@ -545,7 +545,7 @@ def get_prediction_tokens(self, *args, **kwargs): fp_options="float16", ), name="meta-llama/Llama-2-7b-hf", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="01c7f73d771dfac7d292323805ebc428287df4f9", release_date="2023-07-18", @@ -571,7 +571,7 @@ def get_prediction_tokens(self, *args, **kwargs): fp_options="float16", ), name="meta-llama/Llama-2-7b-chat-hf", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="f5db02db724555f92da89c216ac04704f23d4590", release_date="2023-07-18", @@ -597,7 +597,7 @@ def get_prediction_tokens(self, *args, **kwargs): fp_options="float16", ), name="mistralai/Mistral-7B-Instruct-v0.2", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="3ad372fc79158a2148299e3318516c786aeded6c", 
release_date="2023-12-11", @@ -623,7 +623,7 @@ def get_prediction_tokens(self, *args, **kwargs): fp_options="float16", ), name="jhu-clsp/FollowIR-7B", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="4d25d437e38b510c01852070c0731e8f6e1875d1", release_date="2024-04-29", @@ -643,107 +643,107 @@ def get_prediction_tokens(self, *args, **kwargs): mt5_languages = [ - "afr_Latn", - "sqi_Latn", - "amh_Ethi", - "ara_Arab", - "hye_Armn", - "aze_Latn", - "eus_Latn", - "bel_Cyrl", - "ben_Beng", - "bul_Cyrl", - "mya_Mymr", - "cat_Latn", - "ceb_Latn", - "nya_Latn", - "zho_Hans", - "cos_Latn", - "ces_Latn", - "dan_Latn", - "nld_Latn", - "eng_Latn", - "epo_Latn", - "est_Latn", - "fil_Latn", - "fin_Latn", - "fra_Latn", - "glg_Latn", - "kat_Geor", - "deu_Latn", - "ell_Grek", - "guj_Gujr", - "hat_Latn", - "hau_Latn", - "haw_Latn", - "heb_Hebr", - "hin_Deva", - "hmn_Latn", - "hun_Latn", - "isl_Latn", - "ibo_Latn", - "ind_Latn", - "gle_Latn", - "ita_Latn", - "jpn_Jpan", - "jav_Latn", - "kan_Knda", - "kaz_Cyrl", - "khm_Khmr", - "kor_Hang", - "kur_Latn", - "kir_Cyrl", - "lao_Laoo", - "lat_Latn", - "lav_Latn", - "lit_Latn", - "ltz_Latn", - "mkd_Cyrl", - "mlg_Latn", - "msa_Latn", - "mal_Mlym", - "mlt_Latn", - "mri_Latn", - "mar_Deva", - "mon_Cyrl", - "nep_Deva", - "nor_Latn", - "pus_Arab", - "fas_Arab", - "pol_Latn", - "por_Latn", - "pan_Guru", - "ron_Latn", - "rus_Cyrl", - "smo_Latn", - "gla_Latn", - "srp_Cyrl", - "sna_Latn", - "snd_Arab", - "sin_Sinh", - "slk_Latn", - "slv_Latn", - "som_Latn", - "sot_Latn", - "spa_Latn", - "sun_Latn", - "swa_Latn", - "swe_Latn", - "tgk_Cyrl", - "tam_Taml", - "tel_Telu", - "tha_Thai", - "tur_Latn", - "ukr_Cyrl", - "urd_Arab", - "uzb_Latn", - "vie_Latn", - "cym_Latn", - "fry_Latn", - "xho_Latn", - "yid_Hebr", - "yor_Latn", - "zul_Latn", + "afr-Latn", + "sqi-Latn", + "amh-Ethi", + "ara-Arab", + "hye-Armn", + "aze-Latn", + "eus-Latn", + "bel-Cyrl", + "ben-Beng", + "bul-Cyrl", + "mya-Mymr", + "cat-Latn", + "ceb-Latn", + 
"nya-Latn", + "zho-Hans", + "cos-Latn", + "ces-Latn", + "dan-Latn", + "nld-Latn", + "eng-Latn", + "epo-Latn", + "est-Latn", + "fil-Latn", + "fin-Latn", + "fra-Latn", + "glg-Latn", + "kat-Geor", + "deu-Latn", + "ell-Grek", + "guj-Gujr", + "hat-Latn", + "hau-Latn", + "haw-Latn", + "heb-Hebr", + "hin-Deva", + "hmn-Latn", + "hun-Latn", + "isl-Latn", + "ibo-Latn", + "ind-Latn", + "gle-Latn", + "ita-Latn", + "jpn-Jpan", + "jav-Latn", + "kan-Knda", + "kaz-Cyrl", + "khm-Khmr", + "kor-Hang", + "kur-Latn", + "kir-Cyrl", + "lao-Laoo", + "lat-Latn", + "lav-Latn", + "lit-Latn", + "ltz-Latn", + "mkd-Cyrl", + "mlg-Latn", + "msa-Latn", + "mal-Mlym", + "mlt-Latn", + "mri-Latn", + "mar-Deva", + "mon-Cyrl", + "nep-Deva", + "nor-Latn", + "pus-Arab", + "fas-Arab", + "pol-Latn", + "por-Latn", + "pan-Guru", + "ron-Latn", + "rus-Cyrl", + "smo-Latn", + "gla-Latn", + "srp-Cyrl", + "sna-Latn", + "snd-Arab", + "sin-Sinh", + "slk-Latn", + "slv-Latn", + "som-Latn", + "sot-Latn", + "spa-Latn", + "sun-Latn", + "swa-Latn", + "swe-Latn", + "tgk-Cyrl", + "tam-Taml", + "tel-Telu", + "tha-Thai", + "tur-Latn", + "ukr-Cyrl", + "urd-Arab", + "uzb-Latn", + "vie-Latn", + "cym-Latn", + "fry-Latn", + "xho-Latn", + "yid-Hebr", + "yor-Latn", + "zul-Latn", ] mt5_base_mmarco_v2 = ModelMeta( diff --git a/mteb/models/richinfoai_models.py b/mteb/models/richinfoai_models.py index c46df9c378..213699893a 100644 --- a/mteb/models/richinfoai_models.py +++ b/mteb/models/richinfoai_models.py @@ -6,7 +6,7 @@ ritrieve_zh_v1 = ModelMeta( name="richinfoai/ritrieve_zh_v1", - languages=["zho_Hans"], + languages=["zho-Hans"], open_weights=True, revision="f8d5a707656c55705027678e311f9202c8ced12c", release_date="2025-03-25", diff --git a/mteb/models/ru_sentence_models.py b/mteb/models/ru_sentence_models.py index c3c63d5461..e939b8eb07 100644 --- a/mteb/models/ru_sentence_models.py +++ b/mteb/models/ru_sentence_models.py @@ -8,11 +8,15 @@ from mteb.encoder_interface import PromptType from mteb.model_meta import ModelMeta, 
sentence_transformers_loader +from mteb.models.bge_models import bge_m3_training_data from mteb.models.instruct_wrapper import InstructSentenceTransformerWrapper +from mteb.models.nomic_models import ( + nomic_training_data, +) rubert_tiny = ModelMeta( name="cointegrated/rubert-tiny", - languages=["rus_Cyrl"], + languages=["rus-Cyrl"], open_weights=True, revision="5441c5ea8026d4f6d7505ec004845409f1259fb1", release_date="2021-05-24", @@ -36,7 +40,7 @@ rubert_tiny2 = ModelMeta( name="cointegrated/rubert-tiny2", - languages=["rus_Cyrl"], + languages=["rus-Cyrl"], open_weights=True, revision="dad72b8f77c5eef6995dd3e4691b758ba56b90c3", release_date="2021-10-28", @@ -61,7 +65,7 @@ sbert_large_nlu_ru = ModelMeta( name="ai-forever/sbert_large_nlu_ru", - languages=["rus_Cyrl"], + languages=["rus-Cyrl"], open_weights=True, revision="af977d5dfa46a3635e29bf0ef383f2df2a08d47a", release_date="2020-11-20", @@ -85,7 +89,7 @@ sbert_large_mt_nlu_ru = ModelMeta( name="ai-forever/sbert_large_mt_nlu_ru", - languages=["rus_Cyrl"], + languages=["rus-Cyrl"], open_weights=True, revision="05300876c2b83f46d3ddd422a7f17e45cf633bb0", release_date="2021-05-18", @@ -114,7 +118,7 @@ model_prompts={"query": "query: ", "passage": "passage: "}, ), name="deepvk/USER-base", - languages=["rus_Cyrl"], + languages=["rus-Cyrl"], open_weights=True, revision="436a489a2087d61aa670b3496a9915f84e46c861", release_date="2024-06-10", @@ -170,7 +174,7 @@ revision="0cc6cfe48e260fb0474c753087a69369e88709ae", ), name="deepvk/USER-bge-m3", - languages=["rus_Cyrl"], + languages=["rus-Cyrl"], open_weights=True, revision="0cc6cfe48e260fb0474c753087a69369e88709ae", release_date="2024-07-05", @@ -212,7 +216,7 @@ deberta_v1_ru = ModelMeta( name="deepvk/deberta-v1-base", - languages=["rus_Cyrl"], + languages=["rus-Cyrl"], open_weights=True, revision="bdd30b0e19757e6940c92c7aff19e8fc0a60dff4", release_date="2023-02-07", @@ -241,7 +245,7 @@ rubert_base_cased = ModelMeta( name="DeepPavlov/rubert-base-cased", - 
languages=["rus_Cyrl"], + languages=["rus-Cyrl"], open_weights=True, revision="4036cab694767a299f2b9e6492909664d9414229", release_date="2020-03-04", @@ -266,7 +270,7 @@ distilrubert_small_cased_conversational = ModelMeta( name="DeepPavlov/distilrubert-small-cased-conversational", - languages=["rus_Cyrl"], + languages=["rus-Cyrl"], open_weights=True, revision="e348066b4a7279b97138038299bddc6580a9169a", release_date="2022-06-28", @@ -289,7 +293,7 @@ rubert_base_cased_sentence = ModelMeta( name="DeepPavlov/rubert-base-cased-sentence", - languages=["rus_Cyrl"], + languages=["rus-Cyrl"], open_weights=True, revision="78b5122d6365337dd4114281b0d08cd1edbb3bc8", release_date="2020-03-04", @@ -312,7 +316,7 @@ labse_en_ru = ModelMeta( name="cointegrated/LaBSE-en-ru", - languages=["rus_Cyrl"], + languages=["rus-Cyrl"], open_weights=True, revision="cf0714e606d4af551e14ad69a7929cd6b0da7f7e", release_date="2021-06-10", @@ -338,7 +342,7 @@ } rubert_tiny_turbo = ModelMeta( name="sergeyzh/rubert-tiny-turbo", - languages=["rus_Cyrl"], + languages=["rus-Cyrl"], open_weights=True, revision="8ce0cf757446ce9bb2d5f5a4ac8103c7a1049054", release_date="2024-06-21", @@ -359,7 +363,7 @@ rubert_mini_frida = ModelMeta( name="sergeyzh/rubert-mini-frida", - languages=["rus_Cyrl"], + languages=["rus-Cyrl"], open_weights=True, revision="19b279b78afd945b5ccae78f63e284909814adc2", release_date="2025-03-02", @@ -385,7 +389,7 @@ labse_ru_turbo = ModelMeta( name="sergeyzh/LaBSE-ru-turbo", - languages=["rus_Cyrl"], + languages=["rus-Cyrl"], open_weights=True, revision="1940b046c6b5e125df11722b899130329d0a46da", release_date="2024-06-27", @@ -406,7 +410,7 @@ berta = ModelMeta( name="sergeyzh/BERTA", - languages=["rus_Cyrl"], + languages=["rus-Cyrl"], open_weights=True, revision="914c8c8aed14042ed890fc2c662d5e9e66b2faa7", release_date="2025-03-10", @@ -461,7 +465,7 @@ model_prompts=rosberta_prompts, ), name="ai-forever/ru-en-RoSBERTa", - languages=["rus_Cyrl"], + languages=["rus-Cyrl"], open_weights=True, 
revision="89fb1651989adbb1cfcfdedafd7d102951ad0555", release_date="2024-07-29", @@ -617,7 +621,7 @@ model_prompts=frida_prompts, ), name="ai-forever/FRIDA", - languages=["rus_Cyrl"], + languages=["rus-Cyrl"], open_weights=True, revision="7292217af9a9e6dbf07048f76b434ad1e2aa8b76", release_date="2024-12-29", @@ -649,7 +653,7 @@ }, ), name="ai-sage/Giga-Embeddings-instruct", - languages=["eng_Latn", "rus_Cyrl"], + languages=["eng-Latn", "rus-Cyrl"], open_weights=True, revision="646f5ff3587e74a18141c8d6b60d1cffd5897b92", release_date="2024-12-13", @@ -666,3 +670,152 @@ public_training_data=None, training_datasets=None, ) + +berta_training_datasets = { + **frida_training_datasets, # distilled from FRIDA + # https://huggingface.co/datasets/IlyaGusev/gazeta + # https://huggingface.co/datasets/zloelias/lenta-ru + # https://huggingface.co/datasets/HuggingFaceFW/fineweb-2 + # https://huggingface.co/datasets/HuggingFaceFW/fineweb +} + +berta = ModelMeta( + name="sergeyzh/BERTA", + languages=["rus-Cyrl"], + open_weights=True, + revision="914c8c8aed14042ed890fc2c662d5e9e66b2faa7", + release_date="2025-03-10", + n_parameters=128_000_000, + memory_usage_mb=489, + embed_dim=768, + license="mit", + max_tokens=512, + reference="https://huggingface.co/sergeyzh/BERTA", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=True, + training_datasets=berta_training_datasets, + public_training_code=None, + adapted_from="sergeyzh/LaBSE-ru-turbo", + public_training_data=None, +) + +rubert_mini_frida = ModelMeta( + name="sergeyzh/rubert-mini-frida", + languages=["rus-Cyrl"], + open_weights=True, + revision="19b279b78afd945b5ccae78f63e284909814adc2", + release_date="2025-03-02", + n_parameters=32_300_000, + memory_usage_mb=123, + embed_dim=312, + license="mit", + max_tokens=2048, + reference="https://huggingface.co/sergeyzh/rubert-mini-frida", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=True, 
+ public_training_code=None, + public_training_data=None, + training_datasets=berta_training_datasets, + adapted_from="sergeyzh/rubert-mini-sts", +) + + +user2_training_data = { + **nomic_training_data, + **bge_m3_training_data, + # deepvk/cultura_ru_edu + # AllNLI + # nyuuzyou/fishkinet-posts + # IlyaGusev/gazeta + # its5Q/habr_qna + # zloelias/lenta-ru + # unicamp-dl/mmarco + # deepvk/ru-HNP + # deepvk/ru-WANLI + # wikimedia/wikipedia + # CarlBrendt/Summ_Dialog_News + # RussianNLP/wikiomnia + # its5Q/yandex-q + # "mC4" ru + # "CC-News" ru + # MultiLongDocRetrieval +} + +user2_prompts = { + # Override some prompts for ruMTEB tasks + "HeadlineClassification": "search_query: ", + "RuSciBenchGRNTIClassification": "clustering: ", + "RuSciBenchOECDClassification": "clustering: ", + "GeoreviewClusteringP2P": "search_query: ", + "SensitiveTopicsClassification": "search_query: ", + "STS22": "search_document: ", + "InappropriatenessClassification": "classification: ", + "CEDRClassification": "classification: ", + # Default + "Classification": "classification: ", + "MultilabelClassification": "classification: ", + "Clustering": "clustering: ", + "PairClassification": "classification: ", + "Reranking": "classification: ", + f"Reranking-{PromptType.query.value}": "search_query: ", + f"Reranking-{PromptType.passage.value}": "search_document: ", + "STS": "classification: ", + "Summarization": "clustering: ", + PromptType.query.value: "search_query: ", + PromptType.passage.value: "search_document: ", +} +user2_small = ModelMeta( + loader=partial( + sentence_transformers_loader, + model_name="deepvk/USER2-small", + revision="23f65b34cf7632032061f5cc66c14714e6d4cee4", + model_prompts=user2_prompts, + ), + name="deepvk/USER2-small", + languages=["rus-Cyrl"], + open_weights=True, + revision="23f65b34cf7632032061f5cc66c14714e6d4cee4", + release_date="2025-04-19", + use_instructions=True, + reference="https://huggingface.co/collections/deepvk/user2-6802650d7210f222ec60e05f", + 
n_parameters=34_400_000, + memory_usage_mb=131, + max_tokens=8192, + embed_dim=384, + license="apache-2.0", + similarity_fn_name="cosine", + adapted_from="deepvk/RuModernBERT-small", + training_datasets=user2_training_data, + public_training_data=None, + public_training_code="https://github.com/BlessedTatonka/some_code/tree/2899f27d51efdf4217fc6453799ff197e9792f1e", + framework=["Sentence Transformers", "PyTorch"], +) + +user2_base = ModelMeta( + loader=partial( + sentence_transformers_loader, + model_name="deepvk/USER2-base", + revision="0942cf96909b6d52e61f79a01e2d30c7be640b27", + model_prompts=user2_prompts, + ), + name="deepvk/USER2-base", + languages=["rus-Cyrl"], + open_weights=True, + revision="0942cf96909b6d52e61f79a01e2d30c7be640b27", + release_date="2025-04-19", + use_instructions=True, + reference="https://huggingface.co/collections/deepvk/user2-6802650d7210f222ec60e05f", + n_parameters=149_000_000, + memory_usage_mb=568, + max_tokens=8192, + embed_dim=768, + license="apache-2.0", + similarity_fn_name="cosine", + adapted_from="deepvk/RuModernBERT-base", + training_datasets=user2_training_data, + public_training_data=None, + public_training_code="https://github.com/BlessedTatonka/some_code/tree/2899f27d51efdf4217fc6453799ff197e9792f1e", + framework=["Sentence Transformers", "PyTorch"], +) diff --git a/mteb/models/salesforce_models.py b/mteb/models/salesforce_models.py index fdcf30e82d..ab0e230f7b 100644 --- a/mteb/models/salesforce_models.py +++ b/mteb/models/salesforce_models.py @@ -50,7 +50,7 @@ def instruction_template( normalized=True, ), name="Salesforce/SFR-Embedding-2_R", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="91762139d94ed4371a9fa31db5551272e0b83818", release_date="2024-06-14", # initial commit of hf model. 
@@ -81,7 +81,7 @@ def instruction_template( normalized=True, ), name="Salesforce/SFR-Embedding-Code-2B_R", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="c73d8631a005876ed5abde34db514b1fb6566973", release_date="2025-01-17", # initial commit of hf model. @@ -112,7 +112,7 @@ def instruction_template( normalized=True, ), name="Salesforce/SFR-Embedding-Mistral", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="938c560d1c236aa563b2dbdf084f28ab28bccb11", release_date="2024-01-24", # initial commit of hf model. diff --git a/mteb/models/searchmap_models.py b/mteb/models/searchmap_models.py index 69bb81929c..6ab3850541 100644 --- a/mteb/models/searchmap_models.py +++ b/mteb/models/searchmap_models.py @@ -24,7 +24,7 @@ ), name="VPLabs/SearchMap_Preview", revision="69de17ef48278ed08ba1a4e65ead8179912b696e", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, use_instructions=True, release_date="2025-03-05", diff --git a/mteb/models/seed_models.py b/mteb/models/seed_models.py new file mode 100644 index 0000000000..2555ec9aaa --- /dev/null +++ b/mteb/models/seed_models.py @@ -0,0 +1,167 @@ +from __future__ import annotations + +import logging +import os +import time +from functools import partial +from typing import Any + +import numpy as np +import tqdm + +from mteb.encoder_interface import PromptType +from mteb.model_meta import ModelMeta +from mteb.models.wrapper import Wrapper +from mteb.requires_package import requires_package + +logger = logging.getLogger(__name__) + + +class SeedTextEmbeddingModel(Wrapper): + def __init__( + self, + model_name: str, + rate_limit_per_minute: int = 300, + **kwargs, + ) -> None: + requires_package( + self, + "openai", + model_name, + install_instruction="pip install 'mteb[openai]'", + ) + from openai import OpenAI + + requires_package( + self, + "tiktoken", + model_name, + install_instruction="pip install 'mteb[openai]'", + ) + import tiktoken + + 
self.model_name = model_name + self.rate_limit_per_minute = rate_limit_per_minute + self.last_request_time = 0 + self.tokenizer = tiktoken.get_encoding("cl100k_base") + self.client = OpenAI( + api_key=os.environ["ARK_API_KEY"], + base_url="https://ark.cn-beijing.volces.com/api/v3", + ) + + def _enforce_rate_limit(self): + """Enforce rate limiting""" + current_time = time.time() + time_since_last_request = current_time - self.last_request_time + min_interval = 60.0 / self.rate_limit_per_minute + + if time_since_last_request < min_interval: + time.sleep(min_interval - time_since_last_request) + + self.last_request_time = time.time() + + def _truncate_text(self, text: str, max_tokens: int = 32000) -> str: + """Truncate text to fit within token limit""" + tokens = self.tokenizer.encode(text) + if len(tokens) > max_tokens: + tokens = tokens[:max_tokens] + text = self.tokenizer.decode(tokens) + return text + + def _format_instruction(self, instruction: str, input_: str) -> str: + if isinstance(instruction, dict): + return input_ + elif isinstance(instruction, str) and len(instruction): + return instruction + "\n" + input_ + else: + return input_ + + def _embed( + self, + sentences: list[str], + instruction: str, + show_progress_bar: bool = False, + retries: int = 5, + ) -> np.ndarray: + max_batch_size = 20 + batches = [ + sentences[i : i + max_batch_size] + for i in range(0, len(sentences), max_batch_size) + ] + + all_embeddings = [] + + for batch in tqdm.tqdm(batches, leave=False, disable=not show_progress_bar): + # Truncate texts + batch = [self._truncate_text(text) for text in batch] + + # Add instruction to each text + batch = [self._format_instruction(instruction, text) for text in batch] + + while retries > 0: + try: + self._enforce_rate_limit() + response = self.client.embeddings.create( + model=self.model_name, input=batch, encoding_format="float" + ) + embeddings = [x.embedding for x in response.data] + break + except Exception as e: + logger.warning( + 
f"Retrying... {retries} retries left. Error: {str(e)}" + ) + retries -= 1 + if retries == 0: + raise e + + all_embeddings.extend(embeddings) + + return np.array(all_embeddings) + + def encode( + self, + sentences: list[str], + *, + task_name: str, + prompt_type: PromptType | None = None, + **kwargs: Any, + ) -> np.ndarray: + logger.warning("The API will be publicly available soon. Stay tuned!") + + instruction = self.get_instruction(task_name, prompt_type) + show_progress_bar = kwargs.pop("show_progress_bar", False) + + return self._embed( + sentences, + instruction=instruction, + show_progress_bar=show_progress_bar, + ) + + +seed_embedding = ModelMeta( + name="ByteDance-Seed/Doubao-1.5-Embedding", + revision="2", + release_date="2025-04-25", + languages=[ + "eng-Latn", + "zho-Hans", + ], + loader=partial( + SeedTextEmbeddingModel, + model_name="doubao-1-5-embedding", + rate_limit_per_minute=300, + ), + max_tokens=32768, + embed_dim=2048, + open_weights=False, + n_parameters=None, + memory_usage_mb=None, + license=None, + reference="https://huggingface.co/ByteDance-Seed/Doubao-1.5-Embedding", + similarity_fn_name="cosine", + framework=["API"], + use_instructions=True, + training_datasets=None, + public_training_code=None, + public_training_data=None, +) diff --git a/mteb/models/sentence_transformers_models.py b/mteb/models/sentence_transformers_models.py index 4825621604..d495f02594 100644 --- a/mteb/models/sentence_transformers_models.py +++ b/mteb/models/sentence_transformers_models.py @@ -8,59 +8,59 @@ from mteb.models.sentence_transformer_wrapper import SentenceTransformerWrapper paraphrase_langs = [ - "ara_Arab", - "bul_Cyrl", - "cat_Latn", - "ces_Latn", - "dan_Latn", - "deu_Latn", - "ell_Grek", - "eng_Latn", - "spa_Latn", - "est_Latn", - "fas_Arab", - "fin_Latn", - "fra_Latn", - "fra_Latn", - "glg_Latn", - "guj_Gujr", - "heb_Hebr", - "hin_Deva", - "hrv_Latn", - "hun_Latn", - "hye_Armn", - "ind_Latn", - "ita_Latn", - "jpn_Jpan", - "kat_Geor", - "kor_Hang", - 
"kur_Arab", - "lit_Latn", - "lav_Latn", - "mkd_Cyrl", - "mon_Cyrl", - "mar_Deva", - "msa_Latn", - "mya_Mymr", - "nob_Latn", - "nld_Latn", - "pol_Latn", - "por_Latn", - "por_Latn", - "ron_Latn", - "rus_Cyrl", - "slk_Latn", - "slv_Latn", - "sqi_Latn", - "srp_Cyrl", - "swe_Latn", - "tha_Thai", - "tur_Latn", - "ukr_Cyrl", - "urd_Arab", - "vie_Latn", - "zho_Hans", - "zho_Hant", + "ara-Arab", + "bul-Cyrl", + "cat-Latn", + "ces-Latn", + "dan-Latn", + "deu-Latn", + "ell-Grek", + "eng-Latn", + "spa-Latn", + "est-Latn", + "fas-Arab", + "fin-Latn", + "fra-Latn", + "fra-Latn", + "glg-Latn", + "guj-Gujr", + "heb-Hebr", + "hin-Deva", + "hrv-Latn", + "hun-Latn", + "hye-Armn", + "ind-Latn", + "ita-Latn", + "jpn-Jpan", + "kat-Geor", + "kor-Hang", + "kur-Arab", + "lit-Latn", + "lav-Latn", + "mkd-Cyrl", + "mon-Cyrl", + "mar-Deva", + "msa-Latn", + "mya-Mymr", + "nob-Latn", + "nld-Latn", + "pol-Latn", + "por-Latn", + "por-Latn", + "ron-Latn", + "rus-Cyrl", + "slk-Latn", + "slv-Latn", + "sqi-Latn", + "srp-Cyrl", + "swe-Latn", + "tha-Thai", + "tur-Latn", + "ukr-Cyrl", + "urd-Arab", + "vie-Latn", + "zho-Hans", + "zho-Hant", ] sent_trf_training_dataset = { @@ -226,7 +226,7 @@ ) multi_qa_MiniLM_L6_cos_v1 = ModelMeta( - name="sentence-transformer/multi-qa-MiniLM-L6-cos-v1", + name="sentence-transformers/multi-qa-MiniLM-L6-cos-v1", languages=["eng-Latn"], open_weights=True, revision="b207367332321f8e44f96e224ef15bc607f4dbf0", @@ -294,55 +294,55 @@ # negation } static_multi_languages = [ - "eng_Latn", - "ara_Arab", - "bul_Cyrl", - "cat_Latn", - "ces_Latn", - "dan_Latn", - "deu_Latn", - "ell_Grek", - "spa_Latn", - "est_Latn", - "fas_Arab", - "fin_Latn", - "fra_Latn", - "glg_Latn", - "guj_Gujr", - "heb_Hebr", - "hin_Deva", - "hun_Latn", - "hye_Armn", - "ind_Latn", - "ita_Latn", - "jpn_Jpan", - "kat_Geor", - "kor_Hang", - "kur_Latn", - "lit_Latn", - "lav_Latn", - "mkd_Cyrl", - "mon_Cyrl", - "mar_Deva", - "mal_Mlym", - "mya_Mymr", - "nob_Latn", - "nld_Latn", - "pol_Latn", - "por_Latn", - 
"ron_Latn", - "rus_Cyrl", - "slk_Latn", - "slv_Latn", - "sqi_Latn", - "srp_Cyrl", - "swe_Latn", - "tha_Thai", - "tur_Latn", - "ukr_Cyrl", - "urd_Arab", - "vie_Latn", - "zho_Hans", + "eng-Latn", + "ara-Arab", + "bul-Cyrl", + "cat-Latn", + "ces-Latn", + "dan-Latn", + "deu-Latn", + "ell-Grek", + "spa-Latn", + "est-Latn", + "fas-Arab", + "fin-Latn", + "fra-Latn", + "glg-Latn", + "guj-Gujr", + "heb-Hebr", + "hin-Deva", + "hun-Latn", + "hye-Armn", + "ind-Latn", + "ita-Latn", + "jpn-Jpan", + "kat-Geor", + "kor-Hang", + "kur-Latn", + "lit-Latn", + "lav-Latn", + "mkd-Cyrl", + "mon-Cyrl", + "mar-Deva", + "mal-Mlym", + "mya-Mymr", + "nob-Latn", + "nld-Latn", + "pol-Latn", + "por-Latn", + "ron-Latn", + "rus-Cyrl", + "slk-Latn", + "slv-Latn", + "sqi-Latn", + "srp-Cyrl", + "swe-Latn", + "tha-Thai", + "tur-Latn", + "ukr-Cyrl", + "urd-Arab", + "vie-Latn", + "zho-Hans", ] static_similarity_mrl_multilingual_v1 = ModelMeta( diff --git a/mteb/models/shuu_model.py b/mteb/models/shuu_model.py new file mode 100644 index 0000000000..b89a957dbe --- /dev/null +++ b/mteb/models/shuu_model.py @@ -0,0 +1,38 @@ +from __future__ import annotations + +from functools import partial + +from mteb.model_meta import ModelMeta, sentence_transformers_loader + +codemodernbert_crow_meta = ModelMeta( + loader=partial( + sentence_transformers_loader, + model_name="Shuu12121/CodeSearch-ModernBERT-Crow-Plus", + revision="044a7a4b552f86e284817234c336bccf16f895ce", + ), + name="Shuu12121/CodeSearch-ModernBERT-Crow-Plus", + languages=["eng-Latn"], + open_weights=True, + revision="044a7a4b552f86e284817234c336bccf16f895ce", + release_date="2025-04-21", + n_parameters=151668480, + memory_usage_mb=607, + embed_dim=768, + license="apache-2.0", + max_tokens=1024, + reference="https://huggingface.co/Shuu12121/CodeSearch-ModernBERT-Crow-Plus", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + public_training_code=None, + public_training_data=None, + 
training_datasets={ + "CodeSearchNetRetrieval": [], + # "code-search-net/code_search_net": ["train"], + # "Shuu12121/python-codesearch-filtered": ["train"], + # "Shuu12121/java-codesearch-filtered": ["train"], + # "Shuu12121/javascript-codesearch-filtered": ["train"], + # "Shuu12121/ruby-codesearch-filtered": ["train"], + # "Shuu12121/rust-codesearch-filtered": ["train"], + }, +) diff --git a/mteb/models/siglip_models.py b/mteb/models/siglip_models.py index cabb3b7794..1aa85f501f 100644 --- a/mteb/models/siglip_models.py +++ b/mteb/models/siglip_models.py @@ -165,7 +165,7 @@ def get_fused_embeddings( model_name="google/siglip-so400m-patch14-224", ), name="google/siglip-so400m-patch14-224", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="d04cf29fca7b6374f74d8bea1969314492266b5e", release_date="2024-01-08", modalities=["image", "text"], @@ -190,7 +190,7 @@ def get_fused_embeddings( model_name="google/siglip-so400m-patch14-384", ), name="google/siglip-so400m-patch14-384", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="9fdffc58afc957d1a03a25b10dba0329ab15c2a3", release_date="2024-01-08", modalities=["image", "text"], @@ -215,7 +215,7 @@ def get_fused_embeddings( model_name="google/siglip-so400m-patch16-256-i18n", ), name="google/siglip-so400m-patch16-256-i18n", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="365d321c0cfdea96bc28e3a29787a11a062681a1", release_date="2024-01-08", modalities=["image", "text"], @@ -240,7 +240,7 @@ def get_fused_embeddings( model_name="google/siglip-base-patch16-256-multilingual", ), name="google/siglip-base-patch16-256-multilingual", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="8952a4eafcde3cb7ab46b1dd629b33f8784ca9c6", release_date="2024-01-08", modalities=["image", "text"], @@ -265,7 +265,7 @@ def get_fused_embeddings( model_name="google/siglip-base-patch16-256", ), name="google/siglip-base-patch16-256", - languages=["eng_Latn"], + languages=["eng-Latn"], 
revision="b078df89e446d623010d890864d4207fe6399f61", release_date="2024-01-08", modalities=["image", "text"], @@ -290,7 +290,7 @@ def get_fused_embeddings( model_name="google/siglip-base-patch16-512", ), name="google/siglip-base-patch16-512", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="753a949581523b60257d93e18391e8c27f72eb22", release_date="2024-01-08", modalities=["image", "text"], @@ -315,7 +315,7 @@ def get_fused_embeddings( model_name="google/siglip-base-patch16-384", ), name="google/siglip-base-patch16-384", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="41aec1c83b32e0a6fca20ad88ba058aa5b5ea394", release_date="2024-01-08", modalities=["image", "text"], @@ -340,7 +340,7 @@ def get_fused_embeddings( model_name="google/siglip-base-patch16-224", ), name="google/siglip-base-patch16-224", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="7fd15f0689c79d79e38b1c2e2e2370a7bf2761ed", release_date="2024-01-08", modalities=["image", "text"], @@ -365,7 +365,7 @@ def get_fused_embeddings( model_name="google/siglip-large-patch16-256", ), name="google/siglip-large-patch16-256", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="d0da9f876e7d66b4e250cd2450c3ba2ce735e447", release_date="2024-01-08", modalities=["image", "text"], @@ -390,7 +390,7 @@ def get_fused_embeddings( model_name="google/siglip-large-patch16-384", ), name="google/siglip-large-patch16-384", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="ce005573a40965dfd21fd937fbdeeebf2439fc35", release_date="2024-01-08", modalities=["image", "text"], diff --git a/mteb/models/stella_models.py b/mteb/models/stella_models.py index 273858e3a1..83b9078ef3 100644 --- a/mteb/models/stella_models.py +++ b/mteb/models/stella_models.py @@ -52,7 +52,7 @@ torch_dtype="auto", ), name="NovaSearch/stella_en_400M_v5", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, use_instructions=True, revision="1bb50bc7bb726810eac2140e62155b88b0df198f", @@ 
-80,7 +80,7 @@ torch_dtype="auto", ), name="NovaSearch/stella_en_1.5B_v5", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, use_instructions=True, revision="d03be74b361d4eb24f42a2fe5bd2e29917df4604", @@ -100,7 +100,7 @@ stella_large_zh_v3_1792d = ModelMeta( name="dunzhang/stella-large-zh-v3-1792d", - languages=["zho_Hans"], + languages=["zho-Hans"], open_weights=True, revision="d5d39eb8cd11c80a63df53314e59997074469f09", release_date="2024-02-17", @@ -127,7 +127,7 @@ stella_base_zh_v3_1792d = ModelMeta( name="infgrad/stella-base-zh-v3-1792d", - languages=["zho_Hans"], + languages=["zho-Hans"], open_weights=True, revision="82254892a0fba125aa2abf3a4800d2dd12821343", release_date="2024-02-17", @@ -155,7 +155,7 @@ stella_mrl_large_zh_v3_5_1792d = ModelMeta( name="dunzhang/stella-mrl-large-zh-v3.5-1792d", - languages=["zho_Hans"], + languages=["zho-Hans"], open_weights=True, revision="17bb1c32a93a8fc5f6fc9e91d5ea86da99983cfe", release_date="2024-02-27", @@ -177,7 +177,7 @@ zpoint_large_embedding_zh = ModelMeta( name="iampanda/zpoint_large_embedding_zh", - languages=["zho_Hans"], + languages=["zho-Hans"], open_weights=True, revision="b1075144f440ab4409c05622c1179130ebd57d03", release_date="2024-06-04", diff --git a/mteb/models/ua_sentence_models.py b/mteb/models/ua_sentence_models.py new file mode 100644 index 0000000000..dd23160f77 --- /dev/null +++ b/mteb/models/ua_sentence_models.py @@ -0,0 +1,31 @@ +"""Sentence models for evaluation on the Ukrainian part of MTEB""" + +from __future__ import annotations + +from mteb.model_meta import ModelMeta + +xlm_roberta_ua_distilled = ModelMeta( + name="panalexeu/xlm-roberta-ua-distilled", + n_parameters=278_000_000, + memory_usage_mb=1061, + max_tokens=512, + embed_dim=768, + revision="9216f50d76b032350ca312246fa2f5dcaa6ca971", + release_date="2025-04-15", + license="mit", + open_weights=True, + 
public_training_code="https://github.com/panalexeu/xlm-roberta-ua-distilled/blob/main/researches/research_final.ipynb", + similarity_fn_name="cosine", + framework=["Sentence Transformers"], + reference="https://github.com/panalexeu/xlm-roberta-ua-distilled/tree/main", + languages=["eng-Latn", "ukr-Cyrl"], + training_datasets={ + # "sentence-transformers/parallel-sentences-talks": ["train"], + # "sentence-transformers/parallel-sentences-wikimatrix": ["train"], + # "sentence-transformers/parallel-sentences-tatoeba": ["train"], + }, + adapted_from="FacebookAI/xlm-roberta-base", + modalities=["text"], + public_training_data=None, + use_instructions=False, +) diff --git a/mteb/models/uae_models.py b/mteb/models/uae_models.py index 6edc84c9e5..2670344bbc 100644 --- a/mteb/models/uae_models.py +++ b/mteb/models/uae_models.py @@ -62,7 +62,7 @@ def encode( }, ), name="WhereIsAI/UAE-Large-V1", - languages=["eng_Latn"], + languages=["eng-Latn"], open_weights=True, revision="369c368f70f16a613f19f5598d4f12d9f44235d4", release_date="2023-12-04", # initial commit of hf model. 
diff --git a/mteb/models/vdr_models.py b/mteb/models/vdr_models.py index bc1cd66c83..33a23da866 100644 --- a/mteb/models/vdr_models.py +++ b/mteb/models/vdr_models.py @@ -13,12 +13,12 @@ def instruction_template( return "{instruction}" -languages = [ - "eng_Latn", - "ita_Latn", - "fra_Latn", - "deu_Latn", - "spa_Latn", +vdr_languages = [ + "eng-Latn", + "ita-Latn", + "fra-Latn", + "deu-Latn", + "spa-Latn", ] vdr_2b_multi_v1 = ModelMeta( @@ -30,7 +30,7 @@ def instruction_template( apply_instruction_to_passages=True, ), name="llamaindex/vdr-2b-multi-v1", - languages=languages, + languages=vdr_languages, open_weights=True, revision="2c4e54c8db4071cc61fc3c62f4490124e40c37db", release_date="2024-01-08", diff --git a/mteb/models/vista_models.py b/mteb/models/vista_models.py index 0905e649ab..7f8ff80500 100644 --- a/mteb/models/vista_models.py +++ b/mteb/models/vista_models.py @@ -273,7 +273,7 @@ def calculate_probs(self, text_embeddings, image_embeddings): image_tokens_num=196, ), name="BAAI/bge-visualized-base", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="98db10b10d22620010d06f11733346e1c98c34aa", release_date="2024-06-06", modalities=["image", "text"], @@ -300,7 +300,7 @@ def calculate_probs(self, text_embeddings, image_embeddings): image_tokens_num=256, ), name="BAAI/bge-visualized-m3", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="98db10b10d22620010d06f11733346e1c98c34aa", release_date="2024-06-06", modalities=["image", "text"], diff --git a/mteb/models/vlm2vec_models.py b/mteb/models/vlm2vec_models.py index 70cc51cd28..65ca7b4004 100644 --- a/mteb/models/vlm2vec_models.py +++ b/mteb/models/vlm2vec_models.py @@ -380,7 +380,7 @@ def get_fused_embeddings( model_name="TIGER-Lab/VLM2Vec-LoRA", ), name="TIGER-Lab/VLM2Vec-LoRA", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="7403b6327958071c1e33c822c7453adadccc7298", release_date="2024-10-08", modalities=["image", "text"], @@ -405,7 +405,7 @@ def get_fused_embeddings( 
model_name="TIGER-Lab/VLM2Vec-Full", ), name="TIGER-Lab/VLM2Vec-Full", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="e9afa98002097ac2471827ba23ea1f2ddd229480", release_date="2024-10-08", modalities=["image", "text"], diff --git a/mteb/models/wav2vec2_models.py b/mteb/models/wav2vec2_models.py index 2eacc5a49f..4b78be92c5 100644 --- a/mteb/models/wav2vec2_models.py +++ b/mteb/models/wav2vec2_models.py @@ -17,61 +17,61 @@ # ISO 639-3 codes for languages supported by wav2vec2 models WAV2VEC2_LANGUAGES = [ - "afr_Latn", - "sqi_Latn", - "amh_Latn", - "ara_Latn", - "hye_Latn", - "asm_Latn", - "aze_Latn", - "eus_Latn", - "bel_Latn", - "ben_Beng", - "bos_Latn", - "bre_Latn", - "bul_Latn", - "mya_Latn", - "cat_Latn", - "khm_Latn", - "zho_Latn", - "hrv_Latn", - "ces_Latn", - "dan_Latn", - "nld_Latn", - "eng_Latn", - "epo_Latn", - "est_Latn", - "fin_Latn", - "fra_Latn", - "glg_Latn", - "kat_Latn", - "deu_Latn", - "ell_Latn", - "guj_Latn", - "hau_Latn", - "heb_Latn", - "hin_Deva", - "hun_Latn", - "isl_Latn", - "ind_Latn", - "gle_Latn", - "ita_Latn", - "jpn_Latn", - "jav_Latn", - "kan_Latn", - "kaz_Latn", - "kir_Latn", - "abk_Cyrl", - "bak_Cyrl", - "ceb_Latn", - "chv_Cyrl", - "div_Thaa", - "fao_Latn", - "grn_Latn", - "hat_Latn", - "haw_Latn", - "ina_Latn", - "kin_Latn", + "afr-Latn", + "sqi-Latn", + "amh-Latn", + "ara-Latn", + "hye-Latn", + "asm-Latn", + "aze-Latn", + "eus-Latn", + "bel-Latn", + "ben-Beng", + "bos-Latn", + "bre-Latn", + "bul-Latn", + "mya-Latn", + "cat-Latn", + "khm-Latn", + "zho-Latn", + "hrv-Latn", + "ces-Latn", + "dan-Latn", + "nld-Latn", + "eng-Latn", + "epo-Latn", + "est-Latn", + "fin-Latn", + "fra-Latn", + "glg-Latn", + "kat-Latn", + "deu-Latn", + "ell-Latn", + "guj-Latn", + "hau-Latn", + "heb-Latn", + "hin-Deva", + "hun-Latn", + "isl-Latn", + "ind-Latn", + "gle-Latn", + "ita-Latn", + "jpn-Latn", + "jav-Latn", + "kan-Latn", + "kaz-Latn", + "kir-Latn", + "abk-Cyrl", + "bak-Cyrl", + "ceb-Latn", + "chv-Cyrl", + "div-Thaa", + "fao-Latn", + 
"grn-Latn", + "hat-Latn", + "haw-Latn", + "ina-Latn", + "kin-Latn", ] @@ -229,7 +229,7 @@ def encode( Wav2Vec2AudioWrapper, model_name="vitouphy/wav2vec2-xls-r-300m-phoneme" ), name="vitouphy/wav2vec2-xls-r-300m-phoneme", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="bf9913bf096d133cf4eca64ed75981ebf0545c9d", release_date="2022-05-19", modalities=["audio"], @@ -323,7 +323,7 @@ def encode( model_revision="0b5b8e868dd84f03fd87d01f9c4ff0f080fecfe8", ), name="facebook/wav2vec2-base", - languages=["en"], + languages=["eng-Latn"], open_weights=True, revision="0b5b8e868dd84f03fd87d01f9c4ff0f080fecfe8", release_date="2020-10-26", @@ -350,7 +350,7 @@ def encode( model_revision="22aad52d435eb6dbaf354bdad9b0da84ce7d6156", ), name="facebook/wav2vec2-base-960h", - languages=["en"], + languages=["eng-Latn"], open_weights=True, revision="22aad52d435eb6dbaf354bdad9b0da84ce7d6156", release_date="2020-10-26", @@ -377,7 +377,7 @@ def encode( model_revision="312b2410566b698c7a649068d413b2067848bd75", ), name="facebook/wav2vec2-large", - languages=["en"], + languages=["eng-Latn"], open_weights=True, revision="312b2410566b698c7a649068d413b2067848bd75", release_date="2020-10-26", @@ -404,7 +404,7 @@ def encode( model_revision="c3f9d884181a224a6ac87bf8885c84d1cff3384f", ), name="facebook/wav2vec2-large-xlsr-53", - languages=["en"], + languages=["eng-Latn"], open_weights=True, revision="c3f9d884181a224a6ac87bf8885c84d1cff3384f", release_date="2020-10-26", @@ -431,7 +431,7 @@ def encode( model_revision="ae45363bf3413b374fecd9dc8bc1df0e24c3b7f4", ), name="facebook/wav2vec2-lv-60-espeak-cv-ft", - languages=["en"], + languages=["eng-Latn"], open_weights=True, revision="ae45363bf3413b374fecd9dc8bc1df0e24c3b7f4", release_date="2020-10-26", diff --git a/mteb/models/wavlm_models.py b/mteb/models/wavlm_models.py index cd6a46fc85..7b429c9700 100644 --- a/mteb/models/wavlm_models.py +++ b/mteb/models/wavlm_models.py @@ -8,7 +8,6 @@ import torch import torchaudio from torch.utils.data 
import DataLoader -from tqdm import tqdm from transformers import Wav2Vec2FeatureExtractor, WavLMModel from mteb.encoder_interface import AudioBatch, AudioData, PromptType @@ -27,12 +26,11 @@ def __init__( self.model_name = model_name self.model_revision = model_revision self.device = device - + self.model = WavLMModel.from_pretrained( - self.model_name, - revision=self.model_revision + self.model_name, revision=self.model_revision ).to(self.device) - + self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained( self.model_name ) @@ -96,7 +94,7 @@ def _load_audio_file(self, path: str) -> torch.Tensor: def _pad_audio_batch(self, batch): batch = [x.reshape(-1) if x.ndim == 0 else x for x in batch] - max_length = max(audio.shape[0] for audio in batch) + max_length = max(audio.shape[0] for audio in batch) padded_batch = [ torch.nn.functional.pad(audio, (0, max_length - audio.shape[0])) for audio in batch @@ -118,16 +116,16 @@ def get_audio_embeddings( with torch.no_grad(): for i in range(0, len(processed_audio), batch_size): batch = processed_audio[i : i + batch_size] - + batch_tensor = self._pad_audio_batch(batch) - + if batch_tensor.ndim == 1: - batch_tensor = batch_tensor.unsqueeze(0) + batch_tensor = batch_tensor.unsqueeze(0) elif batch_tensor.ndim > 2: batch_tensor = batch_tensor.view(batch_tensor.size(0), -1) - + inputs = self.feature_extractor( - batch_tensor.cpu().numpy(), + batch_tensor.cpu().numpy(), sampling_rate=self.sampling_rate, return_tensors="pt", padding="longest", @@ -159,6 +157,7 @@ def encode( ) -> np.ndarray: return self.get_audio_embeddings(inputs, task_name=task_name, **kwargs).numpy() + wavlm_base = ModelMeta( loader=partial( WavlmWrapper, @@ -166,7 +165,7 @@ def encode( model_revision="efa81aae7ff777e464159e0f877d54eac5b84f81", ), name="microsoft/wavlm-base", - languages=["eng"], + languages=["eng-Latn"], open_weights=True, revision="efa81aae7ff777e464159e0f877d54eac5b84f81", release_date="2022-07-19", @@ -192,7 +191,7 @@ def encode( 
model_revision="fe13cca7e592cf0e11287cfede24e6999ac7dc4e", ), name="microsoft/wavlm-base-sd", - languages=["eng"], + languages=["eng-Latn"], open_weights=True, revision="fe13cca7e592cf0e11287cfede24e6999ac7dc4e", release_date="2022-07-19", @@ -218,7 +217,7 @@ def encode( model_revision="4c66d4806a428f2e922ccfa1a962776e232d487b", ), name="microsoft/wavlm-base-plus", - languages=["eng"], + languages=["eng-Latn"], open_weights=True, revision="4c66d4806a428f2e922ccfa1a962776e232d487b", release_date="2022-07-19", @@ -248,7 +247,7 @@ def encode( model_revision="feb593a6c23c1cc3d9510425c29b0a14d2b07b1e", ), name="microsoft/wavlm-base-plus-sv", - languages=["eng"], + languages=["eng-Latn"], open_weights=True, revision="feb593a6c23c1cc3d9510425c29b0a14d2b07b1e", release_date="2022-07-19", @@ -279,7 +278,7 @@ def encode( model_revision="5bd86f0662bd55704109a794c6a1b1790ea0f91a", ), name="microsoft/wavlm-base-plus-sd", - languages=["eng"], + languages=["eng-Latn"], open_weights=True, revision="5bd86f0662bd55704109a794c6a1b1790ea0f91a", release_date="2022-07-19", @@ -310,7 +309,7 @@ def encode( model_revision="0a23162ffc49adcf42bdf836a00cb2eb45af3601", ), name="microsoft/wavlm-base-sv", - languages=["eng"], + languages=["eng-Latn"], open_weights=True, revision="0a23162ffc49adcf42bdf836a00cb2eb45af3601", release_date="2022-07-19", @@ -336,7 +335,7 @@ def encode( model_revision="c1423ed94bb01d80a3f5ce5bc39f6026a0f4828c", ), name="microsoft/wavlm-large", - languages=["eng"], + languages=["eng-Latn"], open_weights=True, revision="c1423ed94bb01d80a3f5ce5bc39f6026a0f4828c", release_date="2022-07-19", diff --git a/mteb/models/whisper_models.py b/mteb/models/whisper_models.py index 06505f0a24..608866b848 100644 --- a/mteb/models/whisper_models.py +++ b/mteb/models/whisper_models.py @@ -154,11 +154,113 @@ def encode( # Model Metas for Different Whisper Models +whisper_langs = [ + "eng-Latn", + "zho-Hans", + "deu-Latn", + "spa-Latn", + "rus-Cyrl", + "kor-Hang", + "fra-Latn", + 
"jpn-Jpan", + "por-Latn", + "tur-Latn", + "pol-Latn", + "cat-Latn", + "nld-Latn", + "ara-Arab", + "swe-Latn", + "ita-Latn", + "ind-Latn", + "hin-Deva", + "fin-Latn", + "vie-Latn", + "heb-Hebr", + "ukr-Cyrl", + "ell-Grek", + "msa-Latn", + "ces-Latn", + "ron-Latn", + "dan-Latn", + "hun-Latn", + "tam-Taml", + "nob-Latn", + "tha-Thai", + "urd-Arab", + "hrv-Latn", + "bul-Cyrl", + "lit-Latn", + "lat-Latn", + "mri-Latn", + "mal-Mlym", + "cym-Latn", + "slk-Latn", + "tel-Telu", + "fas-Arab", + "lav-Latn", + "ben-Beng", + "srp-Cyrl", + "aze-Latn", + "slv-Latn", + "kan-Knda", + "est-Latn", + "mkd-Cyrl", + "bre-Latn", + "eus-Latn", + "isl-Latn", + "hye-Armn", + "nep-Deva", + "mon-Cyrl", + "bos-Latn", + "kaz-Cyrl", + "sqi-Latn", + "swa-Latn", + "glg-Latn", + "mar-Deva", + "pan-Guru", + "sin-Sinh", + "khm-Khmr", + "sna-Latn", + "yor-Latn", + "som-Latn", + "afr-Latn", + "oci-Latn", + "kat-Geor", + "bel-Cyrl", + "tgk-Cyrl", + "snd-Arab", + "guj-Gujr", + "amh-Ethi", + "yid-Hebr", + "lao-Laoo", + "uzb-Latn", + "fao-Latn", + "hat-Latn", + "pus-Arab", + "tuk-Latn", + "nno-Latn", + "mlt-Latn", + "san-Deva", + "ltz-Latn", + "mya-Mymr", + "bod-Tibt", + "tgl-Latn", + "mlg-Latn", + "asm-Beng", + "tat-Cyrl", + "haw-Latn", + "lin-Latn", + "hau-Latn", + "bak-Cyrl", + "jav-Latn", + "sun-Latn", +] + whisper_tiny = ModelMeta( loader=partial(WhisperAudioWrapper, model_name="openai/whisper-tiny"), name="openai/whisper-tiny", - languages=["eng", "multilingual"], + languages=whisper_langs, open_weights=True, revision="main", release_date="2022-09-27", @@ -180,7 +282,7 @@ def encode( whisper_base = ModelMeta( loader=partial(WhisperAudioWrapper, model_name="openai/whisper-base"), name="openai/whisper-base", - languages=["eng", "multilingual"], + languages=whisper_langs, open_weights=True, revision="main", release_date="2022-09-27", @@ -202,7 +304,7 @@ def encode( whisper_small = ModelMeta( loader=partial(WhisperAudioWrapper, model_name="openai/whisper-small"), name="openai/whisper-small", - 
languages=["eng", "multilingual"], + languages=whisper_langs, open_weights=True, revision="main", release_date="2022-09-27", @@ -224,7 +326,7 @@ def encode( whisper_medium = ModelMeta( loader=partial(WhisperAudioWrapper, model_name="openai/whisper-medium"), name="openai/whisper-medium", - languages=["eng", "multilingual"], + languages=whisper_langs, open_weights=True, revision="main", release_date="2022-09-27", @@ -246,7 +348,7 @@ def encode( whisper_large_v3 = ModelMeta( loader=partial(WhisperAudioWrapper, model_name="openai/whisper-large-v3"), name="openai/whisper-large-v3", - languages=["multilingual"], + languages=whisper_langs, open_weights=True, revision="main", release_date="2022-09-27", diff --git a/mteb/overview.py b/mteb/overview.py index 03ad0e67ba..461221e7fa 100644 --- a/mteb/overview.py +++ b/mteb/overview.py @@ -133,6 +133,15 @@ def filter_tasks_by_modalities( return [t for t in tasks if _modalities.intersection(t.modalities)] +def filter_aggregate_tasks(tasks: list[AbsTask]) -> list[AbsTask]: + """Returns input tasks that are *not* aggregate. + + Args: + tasks: A list of tasks to filter. + """ + return [t for t in tasks if not t.is_aggregate] + + class MTEBTasks(tuple): def __repr__(self) -> str: return "MTEBTasks" + super().__repr__() @@ -278,6 +287,7 @@ def get_tasks( exclusive_language_filter: bool = False, modalities: list[MODALITIES] | None = None, exclusive_modality_filter: bool = False, + exclude_aggregate: bool = False, ) -> MTEBTasks: """Get a list of tasks based on the specified filters. @@ -300,6 +310,7 @@ def get_tasks( exclusive_modality_filter: If True, only keep tasks where _all_ filter modalities are included in the task's modalities and ALL task modalities are in filter modalities (exact match). If False, keep tasks if _any_ of the task's modalities match the filter modalities. + exclude_aggregate: If True, exclude aggregate tasks. If False, both aggregate and non-aggregate tasks are returned. 
Returns: A list of all initialized tasks objects which pass all of the filters (AND operation). @@ -350,6 +361,8 @@ def get_tasks( _tasks = filter_tasks_by_modalities( _tasks, modalities, exclusive_modality_filter ) + if exclude_aggregate: + _tasks = filter_aggregate_tasks(_tasks) return MTEBTasks(_tasks) diff --git a/mteb/tasks/Audio/AudioClassification/eng/BeijingOpera.py b/mteb/tasks/Audio/AudioClassification/eng/BeijingOpera.py index 9147e594f4..4912ff4122 100644 --- a/mteb/tasks/Audio/AudioClassification/eng/BeijingOpera.py +++ b/mteb/tasks/Audio/AudioClassification/eng/BeijingOpera.py @@ -28,17 +28,19 @@ class BeijingOpera(AbsTaskAudioClassification): dialect=[], modalities=["audio"], sample_creation="created", - bibtex_citation="""@INPROCEEDINGS{6853981, - author={Tian, Mi and Srinivasamurthy, Ajay and Sandler, Mark and Serra, Xavier}, - booktitle={2014 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, - title={A study of instrument-wise onset detection in Beijing Opera percussion ensembles}, - year={2014}, - volume={}, - number={}, - pages={2159-2163}, - keywords={Decision support systems;Conferences;Acoustics;Speech;Speech processing;Time-frequency analysis;Beijing Opera;Onset Detection;Drum Transcription;Non-negative matrix factorization}, - doi={10.1109/ICASSP.2014.6853981}} - """, + bibtex_citation=r""" +@inproceedings{6853981, + author = {Tian, Mi and Srinivasamurthy, Ajay and Sandler, Mark and Serra, Xavier}, + booktitle = {2014 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, + doi = {10.1109/ICASSP.2014.6853981}, + keywords = {Decision support systems;Conferences;Acoustics;Speech;Speech processing;Time-frequency analysis;Beijing Opera;Onset Detection;Drum Transcription;Non-negative matrix factorization}, + number = {}, + pages = {2159-2163}, + title = {A study of instrument-wise onset detection in Beijing Opera percussion ensembles}, + volume = {}, + year = {2014}, +} +""", 
descriptive_stats={ "n_samples": {"train": 236, "test": 3021}, # test samples not found! }, diff --git a/mteb/tasks/Audio/AudioClassification/eng/CREMA_D.py b/mteb/tasks/Audio/AudioClassification/eng/CREMA_D.py index 3615b9c429..19d885b6f7 100644 --- a/mteb/tasks/Audio/AudioClassification/eng/CREMA_D.py +++ b/mteb/tasks/Audio/AudioClassification/eng/CREMA_D.py @@ -28,16 +28,18 @@ class CREMA_D(AbsTaskAudioClassification): dialect=[], modalities=["audio"], sample_creation="created", - bibtex_citation="""@article{cao2014crema, - title={Crema-d: Crowd-sourced emotional multimodal actors dataset}, - author={Cao, Houwei and Cooper, David G and Keutmann, Michael K and Gur, Ruben C and Nenkova, Ani and Verma, Ragini}, - journal={IEEE transactions on affective computing}, - volume={5}, - number={4}, - pages={377--390}, - year={2014}, - publisher={IEEE} - }""", + bibtex_citation=r""" +@article{cao2014crema, + author = {Cao, Houwei and Cooper, David G and Keutmann, Michael K and Gur, Ruben C and Nenkova, Ani and Verma, Ragini}, + journal = {IEEE transactions on affective computing}, + number = {4}, + pages = {377--390}, + publisher = {IEEE}, + title = {Crema-d: Crowd-sourced emotional multimodal actors dataset}, + volume = {5}, + year = {2014}, +} +""", descriptive_stats={ "n_samples": {"train": 7442}, }, diff --git a/mteb/tasks/Audio/AudioClassification/eng/CommonLanguageAgeDetection.py b/mteb/tasks/Audio/AudioClassification/eng/CommonLanguageAgeDetection.py index fc729ffb9b..8f4617bdf3 100644 --- a/mteb/tasks/Audio/AudioClassification/eng/CommonLanguageAgeDetection.py +++ b/mteb/tasks/Audio/AudioClassification/eng/CommonLanguageAgeDetection.py @@ -28,18 +28,19 @@ class CommonLanguageAgeDetection(AbsTaskAudioClassification): dialect=[], modalities=["audio"], sample_creation="found", - bibtex_citation="""@dataset{ganesh_sinisetty_2021_5036977, - author = {Ganesh Sinisetty and - Pavlo Ruban and - Oleksandr Dymov and - Mirco Ravanelli}, - title = {CommonLanguage}, - month = 
jun, - year = 2021, - publisher = {Zenodo}, - version = {0.1}, - doi = {10.5281/zenodo.5036977}, - url = {https://doi.org/10.5281/zenodo.5036977} + bibtex_citation=r""" +@dataset{ganesh_sinisetty_2021_5036977, + author = {Ganesh Sinisetty and +Pavlo Ruban and +Oleksandr Dymov and +Mirco Ravanelli}, + doi = {10.5281/zenodo.5036977}, + month = jun, + publisher = {Zenodo}, + title = {CommonLanguage}, + url = {https://doi.org/10.5281/zenodo.5036977}, + version = {0.1}, + year = {2021}, } """, descriptive_stats={ diff --git a/mteb/tasks/Audio/AudioClassification/eng/CommonLanguageGenderDetection.py b/mteb/tasks/Audio/AudioClassification/eng/CommonLanguageGenderDetection.py index 02f96514cd..15b3c869e8 100644 --- a/mteb/tasks/Audio/AudioClassification/eng/CommonLanguageGenderDetection.py +++ b/mteb/tasks/Audio/AudioClassification/eng/CommonLanguageGenderDetection.py @@ -28,18 +28,19 @@ class CommonLanguageGenderDetection(AbsTaskAudioClassification): dialect=[], modalities=["audio"], sample_creation="found", - bibtex_citation="""@dataset{ganesh_sinisetty_2021_5036977, - author = {Ganesh Sinisetty and - Pavlo Ruban and - Oleksandr Dymov and - Mirco Ravanelli}, - title = {CommonLanguage}, - month = jun, - year = 2021, - publisher = {Zenodo}, - version = {0.1}, - doi = {10.5281/zenodo.5036977}, - url = {https://doi.org/10.5281/zenodo.5036977} + bibtex_citation=r""" +@dataset{ganesh_sinisetty_2021_5036977, + author = {Ganesh Sinisetty and +Pavlo Ruban and +Oleksandr Dymov and +Mirco Ravanelli}, + doi = {10.5281/zenodo.5036977}, + month = jun, + publisher = {Zenodo}, + title = {CommonLanguage}, + url = {https://doi.org/10.5281/zenodo.5036977}, + version = {0.1}, + year = {2021}, } """, descriptive_stats={ diff --git a/mteb/tasks/Audio/AudioClassification/eng/CommonLanguageLanguageClassification.py b/mteb/tasks/Audio/AudioClassification/eng/CommonLanguageLanguageClassification.py index 2386cb28d4..b8fac9c656 100644 --- 
a/mteb/tasks/Audio/AudioClassification/eng/CommonLanguageLanguageClassification.py +++ b/mteb/tasks/Audio/AudioClassification/eng/CommonLanguageLanguageClassification.py @@ -28,18 +28,19 @@ class CommonLanguageLanguageClassification(AbsTaskAudioClassification): dialect=[], modalities=["audio"], sample_creation="found", - bibtex_citation="""@dataset{ganesh_sinisetty_2021_5036977, - author = {Ganesh Sinisetty and - Pavlo Ruban and - Oleksandr Dymov and - Mirco Ravanelli}, - title = {CommonLanguage}, - month = jun, - year = 2021, - publisher = {Zenodo}, - version = {0.1}, - doi = {10.5281/zenodo.5036977}, - url = {https://doi.org/10.5281/zenodo.5036977} + bibtex_citation=r""" +@dataset{ganesh_sinisetty_2021_5036977, + author = {Ganesh Sinisetty and +Pavlo Ruban and +Oleksandr Dymov and +Mirco Ravanelli}, + doi = {10.5281/zenodo.5036977}, + month = jun, + publisher = {Zenodo}, + title = {CommonLanguage}, + url = {https://doi.org/10.5281/zenodo.5036977}, + version = {0.1}, + year = {2021}, } """, descriptive_stats={ diff --git a/mteb/tasks/Audio/AudioClassification/eng/ESC50.py b/mteb/tasks/Audio/AudioClassification/eng/ESC50.py index 98cb23f2c6..6132dcd108 100644 --- a/mteb/tasks/Audio/AudioClassification/eng/ESC50.py +++ b/mteb/tasks/Audio/AudioClassification/eng/ESC50.py @@ -30,18 +30,20 @@ class ESC50Classification(AbsTaskAudioClassification): dialect=[], modalities=["audio"], sample_creation="found", - bibtex_citation="""@inproceedings{piczak2015dataset, - title = {{ESC}: {Dataset} for {Environmental Sound Classification}}, - author = {Piczak, Karol J.}, - booktitle = {Proceedings of the 23rd {Annual ACM Conference} on {Multimedia}}, - date = {2015-10-13}, - url = {http://dl.acm.org/citation.cfm?doid=2733373.2806390}, - doi = {10.1145/2733373.2806390}, - location = {{Brisbane, Australia}}, - isbn = {978-1-4503-3459-4}, - publisher = {{ACM Press}}, - pages = {1015--1018} - }""", + bibtex_citation=r""" +@inproceedings{piczak2015dataset, + author = {Piczak, Karol J.}, 
+ booktitle = {Proceedings of the 23rd {Annual ACM Conference} on {Multimedia}}, + date = {2015-10-13}, + doi = {10.1145/2733373.2806390}, + isbn = {978-1-4503-3459-4}, + location = {{Brisbane, Australia}}, + pages = {1015--1018}, + publisher = {{ACM Press}}, + title = {{ESC}: {Dataset} for {Environmental Sound Classification}}, + url = {http://dl.acm.org/citation.cfm?doid=2733373.2806390}, +} +""", descriptive_stats={ "n_samples": {"train": 2000}, }, diff --git a/mteb/tasks/Audio/AudioClassification/eng/FSDD.py b/mteb/tasks/Audio/AudioClassification/eng/FSDD.py index 45ac58dae5..887c430adf 100644 --- a/mteb/tasks/Audio/AudioClassification/eng/FSDD.py +++ b/mteb/tasks/Audio/AudioClassification/eng/FSDD.py @@ -28,14 +28,15 @@ class FSDD(AbsTaskAudioClassification): dialect=[], modalities=["audio"], sample_creation="created", - bibtex_citation="""@misc{zohar2018free, - author={J. Zohar and S. Cãar and F. Jason and P. Yuxin and N. Hereman and T. Adhish}, - title={Jakobovski/Free-Spoken-Digit-Dataset: V1.0.8}, - year={2018}, - month={aug}, - url={https://doi.org/10.5281/zenodo.1342401} - } - """, + bibtex_citation=r""" +@misc{zohar2018free, + author = {J. Zohar and S. Cãar and F. Jason and P. Yuxin and N. Hereman and T. Adhish}, + month = {aug}, + title = {Jakobovski/Free-Spoken-Digit-Dataset: V1.0.8}, + url = {https://doi.org/10.5281/zenodo.1342401}, + year = {2018}, +} +""", descriptive_stats={ "n_samples": {"train": 2700, "test": 300}, }, diff --git a/mteb/tasks/Audio/AudioClassification/eng/GTZANGenre.py b/mteb/tasks/Audio/AudioClassification/eng/GTZANGenre.py index 5aef540a96..15c997d8f5 100644 --- a/mteb/tasks/Audio/AudioClassification/eng/GTZANGenre.py +++ b/mteb/tasks/Audio/AudioClassification/eng/GTZANGenre.py @@ -28,16 +28,19 @@ class GTZANGenre(AbsTaskAudioClassification): dialect=[], modalities=["audio"], sample_creation="found", - bibtex_citation="""@ARTICLE{1021072, - author={Tzanetakis, G. 
and Cook, P.}, - journal={IEEE Transactions on Speech and Audio Processing}, - title={Musical genre classification of audio signals}, - year={2002}, - volume={10}, - number={5}, - pages={293-302}, - keywords={Humans;Music information retrieval;Instruments;Computer science;Multiple signal classification;Signal analysis;Pattern recognition;Feature extraction;Wavelet analysis;Cultural differences}, - doi={10.1109/TSA.2002.800560}}""", + bibtex_citation=r""" +@article{1021072, + author = {Tzanetakis, G. and Cook, P.}, + doi = {10.1109/TSA.2002.800560}, + journal = {IEEE Transactions on Speech and Audio Processing}, + keywords = {Humans;Music information retrieval;Instruments;Computer science;Multiple signal classification;Signal analysis;Pattern recognition;Feature extraction;Wavelet analysis;Cultural differences}, + number = {5}, + pages = {293-302}, + title = {Musical genre classification of audio signals}, + volume = {10}, + year = {2002}, +} +""", descriptive_stats={ "n_samples": {"train": 1000}, }, diff --git a/mteb/tasks/Audio/AudioClassification/eng/GunshotTriangulation.py b/mteb/tasks/Audio/AudioClassification/eng/GunshotTriangulation.py index 06a6b72c58..6e70ac45af 100644 --- a/mteb/tasks/Audio/AudioClassification/eng/GunshotTriangulation.py +++ b/mteb/tasks/Audio/AudioClassification/eng/GunshotTriangulation.py @@ -28,16 +28,17 @@ class GunshotTriangulation(AbsTaskAudioClassification): dialect=[], modalities=["audio"], sample_creation="found", - bibtex_citation="""@misc{raponi2021soundgunsdigitalforensics, - title={Sound of Guns: Digital Forensics of Gun Audio Samples meets Artificial Intelligence}, - author={Simone Raponi and Isra Ali and Gabriele Oligeri}, - year={2021}, - eprint={2004.07948}, - archivePrefix={arXiv}, - primaryClass={eess.AS}, - url={https://arxiv.org/abs/2004.07948}, - } - }""", + bibtex_citation=r""" +@misc{raponi2021soundgunsdigitalforensics, + archiveprefix = {arXiv}, + author = {Simone Raponi and Isra Ali and Gabriele Oligeri}, + eprint 
= {2004.07948}, + primaryclass = {eess.AS}, + title = {Sound of Guns: Digital Forensics of Gun Audio Samples meets Artificial Intelligence}, + url = {https://arxiv.org/abs/2004.07948}, + year = {2021}, +} +""", descriptive_stats={ "n_samples": {"train": 88}, }, diff --git a/mteb/tasks/Audio/AudioClassification/eng/LibriCount.py b/mteb/tasks/Audio/AudioClassification/eng/LibriCount.py index ff466072b5..ac8d5e9e07 100644 --- a/mteb/tasks/Audio/AudioClassification/eng/LibriCount.py +++ b/mteb/tasks/Audio/AudioClassification/eng/LibriCount.py @@ -28,16 +28,19 @@ class LibriCount(AbsTaskAudioClassification): dialect=[], modalities=["audio"], sample_creation="created", # from LibriSpeech dataset - bibtex_citation="""@inproceedings{Stoter_2018, - title={Classification vs. Regression in Supervised Learning for Single Channel Speaker Count Estimation}, - url={http://dx.doi.org/10.1109/ICASSP.2018.8462159}, - DOI={10.1109/icassp.2018.8462159}, - booktitle={2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, - publisher={IEEE}, - author={Stoter, Fabian-Robert and Chakrabarty, Soumitro and Edler, Bernd and Habets, Emanuel A. P.}, - year={2018}, - month=apr, pages={436-440}} - """, + bibtex_citation=r""" +@inproceedings{Stoter_2018, + author = {Stoter, Fabian-Robert and Chakrabarty, Soumitro and Edler, Bernd and Habets, Emanuel A. P.}, + booktitle = {2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, + doi = {10.1109/icassp.2018.8462159}, + month = apr, + pages = {436-440}, + publisher = {IEEE}, + title = {Classification vs. 
Regression in Supervised Learning for Single Channel Speaker Count Estimation}, + url = {http://dx.doi.org/10.1109/ICASSP.2018.8462159}, + year = {2018}, +} +""", descriptive_stats={ "n_samples": {"train": 5720}, }, diff --git a/mteb/tasks/Audio/AudioClassification/eng/MridinghamStroke.py b/mteb/tasks/Audio/AudioClassification/eng/MridinghamStroke.py index 8729f46b88..57f1840ff6 100644 --- a/mteb/tasks/Audio/AudioClassification/eng/MridinghamStroke.py +++ b/mteb/tasks/Audio/AudioClassification/eng/MridinghamStroke.py @@ -28,17 +28,19 @@ class MridinghamStroke(AbsTaskAudioClassification): dialect=[], modalities=["audio"], sample_creation="created", - bibtex_citation="""@INPROCEEDINGS{6637633, - author={Anantapadmanabhan, Akshay and Bellur, Ashwin and Murthy, Hema A}, - booktitle={2013 IEEE International Conference on Acoustics, Speech and Signal Processing}, - title={Modal analysis and transcription of strokes of the mridangam using non-negative matrix factorization}, - year={2013}, - volume={}, - number={}, - pages={181-185}, - keywords={Instruments;Vectors;Hidden Markov models;Harmonic analysis;Modal analysis;Dictionaries;Music;Modal Analysis;Mridangam;automatic transcription;Non-negative Matrix Factorization;Hidden Markov models}, - doi={10.1109/ICASSP.2013.6637633}} - """, + bibtex_citation=r""" +@inproceedings{6637633, + author = {Anantapadmanabhan, Akshay and Bellur, Ashwin and Murthy, Hema A}, + booktitle = {2013 IEEE International Conference on Acoustics, Speech and Signal Processing}, + doi = {10.1109/ICASSP.2013.6637633}, + keywords = {Instruments;Vectors;Hidden Markov models;Harmonic analysis;Modal analysis;Dictionaries;Music;Modal Analysis;Mridangam;automatic transcription;Non-negative Matrix Factorization;Hidden Markov models}, + number = {}, + pages = {181-185}, + title = {Modal analysis and transcription of strokes of the mridangam using non-negative matrix factorization}, + volume = {}, + year = {2013}, +} +""", descriptive_stats={ "n_samples": 
{"train": 6977}, }, diff --git a/mteb/tasks/Audio/AudioClassification/eng/MridinghamTonic.py b/mteb/tasks/Audio/AudioClassification/eng/MridinghamTonic.py index e7c92c73f7..5ab39f53d3 100644 --- a/mteb/tasks/Audio/AudioClassification/eng/MridinghamTonic.py +++ b/mteb/tasks/Audio/AudioClassification/eng/MridinghamTonic.py @@ -28,17 +28,19 @@ class MridinghamTonic(AbsTaskAudioClassification): dialect=[], modalities=["audio"], sample_creation="created", - bibtex_citation="""@INPROCEEDINGS{6637633, - author={Anantapadmanabhan, Akshay and Bellur, Ashwin and Murthy, Hema A}, - booktitle={2013 IEEE International Conference on Acoustics, Speech and Signal Processing}, - title={Modal analysis and transcription of strokes of the mridangam using non-negative matrix factorization}, - year={2013}, - volume={}, - number={}, - pages={181-185}, - keywords={Instruments;Vectors;Hidden Markov models;Harmonic analysis;Modal analysis;Dictionaries;Music;Modal Analysis;Mridangam;automatic transcription;Non-negative Matrix Factorization;Hidden Markov models}, - doi={10.1109/ICASSP.2013.6637633}} - """, + bibtex_citation=r""" +@inproceedings{6637633, + author = {Anantapadmanabhan, Akshay and Bellur, Ashwin and Murthy, Hema A}, + booktitle = {2013 IEEE International Conference on Acoustics, Speech and Signal Processing}, + doi = {10.1109/ICASSP.2013.6637633}, + keywords = {Instruments;Vectors;Hidden Markov models;Harmonic analysis;Modal analysis;Dictionaries;Music;Modal Analysis;Mridangam;automatic transcription;Non-negative Matrix Factorization;Hidden Markov models}, + number = {}, + pages = {181-185}, + title = {Modal analysis and transcription of strokes of the mridangam using non-negative matrix factorization}, + volume = {}, + year = {2013}, +} +""", descriptive_stats={ "n_samples": {"train": 6977}, }, diff --git a/mteb/tasks/Audio/AudioClassification/eng/NSynth.py b/mteb/tasks/Audio/AudioClassification/eng/NSynth.py index 0e58917326..4c81d196f5 100644 --- 
a/mteb/tasks/Audio/AudioClassification/eng/NSynth.py +++ b/mteb/tasks/Audio/AudioClassification/eng/NSynth.py @@ -28,15 +28,17 @@ class NSynth(AbsTaskAudioClassification): dialect=[], modalities=["audio"], sample_creation="created", - bibtex_citation="""@misc{engel2017neuralaudiosynthesismusical, - title={Neural Audio Synthesis of Musical Notes with WaveNet Autoencoders}, - author={Jesse Engel and Cinjon Resnick and Adam Roberts and Sander Dieleman and Douglas Eck and Karen Simonyan and Mohammad Norouzi}, - year={2017}, - eprint={1704.01279}, - archivePrefix={arXiv}, - primaryClass={cs.LG}, - url={https://arxiv.org/abs/1704.01279}, - }""", + bibtex_citation=r""" +@misc{engel2017neuralaudiosynthesismusical, + archiveprefix = {arXiv}, + author = {Jesse Engel and Cinjon Resnick and Adam Roberts and Sander Dieleman and Douglas Eck and Karen Simonyan and Mohammad Norouzi}, + eprint = {1704.01279}, + primaryclass = {cs.LG}, + title = {Neural Audio Synthesis of Musical Notes with WaveNet Autoencoders}, + url = {https://arxiv.org/abs/1704.01279}, + year = {2017}, +} +""", descriptive_stats={ "n_samples": {"train": 289205, "validation": 12678, "test": 4096}, }, diff --git a/mteb/tasks/Audio/AudioClassification/eng/SIBFLEURS.py b/mteb/tasks/Audio/AudioClassification/eng/SIBFLEURS.py index 2132a86a8d..bbd012cfe5 100644 --- a/mteb/tasks/Audio/AudioClassification/eng/SIBFLEURS.py +++ b/mteb/tasks/Audio/AudioClassification/eng/SIBFLEURS.py @@ -36,16 +36,17 @@ class SIBFLEURSMultilingualClassification(AbsTaskAudioClassification): dialect=[], modalities=["audio"], sample_creation="found", - bibtex_citation="""@misc{schmidt2025fleursslumassivelymultilingualbenchmark, - title={Fleurs-SLU: A Massively Multilingual Benchmark for Spoken Language Understanding}, - author={Fabian David Schmidt and Ivan Vulić and Goran Glavaš and David Ifeoluwa Adelani}, - year={2025}, - eprint={2501.06117}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2501.06117}, - } - 
""", + bibtex_citation=r""" +@misc{schmidt2025fleursslumassivelymultilingualbenchmark, + archiveprefix = {arXiv}, + author = {Fabian David Schmidt and Ivan Vulić and Goran Glavaš and David Ifeoluwa Adelani}, + eprint = {2501.06117}, + primaryclass = {cs.CL}, + title = {Fleurs-SLU: A Massively Multilingual Benchmark for Spoken Language Understanding}, + url = {https://arxiv.org/abs/2501.06117}, + year = {2025}, +} +""", descriptive_stats={ "n_samples": {"test": 177}, }, diff --git a/mteb/tasks/Audio/AudioClassification/eng/SpokenQAforIC.py b/mteb/tasks/Audio/AudioClassification/eng/SpokenQAforIC.py index 18a518757d..8df1d3e312 100644 --- a/mteb/tasks/Audio/AudioClassification/eng/SpokenQAforIC.py +++ b/mteb/tasks/Audio/AudioClassification/eng/SpokenQAforIC.py @@ -28,15 +28,17 @@ class SpokenQAforIC(AbsTaskAudioClassification): dialect=[], modalities=["audio"], sample_creation="multiple", - bibtex_citation="""@misc{shon2023sluephase2benchmarksuite, - title={SLUE Phase-2: A Benchmark Suite of Diverse Spoken Language Understanding Tasks}, - author={Suwon Shon and Siddhant Arora and Chyi-Jiunn Lin and Ankita Pasad and Felix Wu and Roshan Sharma and Wei-Lun Wu and Hung-Yi Lee and Karen Livescu and Shinji Watanabe}, - year={2023}, - eprint={2212.10525}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2212.10525}, - }""", + bibtex_citation=r""" +@misc{shon2023sluephase2benchmarksuite, + archiveprefix = {arXiv}, + author = {Suwon Shon and Siddhant Arora and Chyi-Jiunn Lin and Ankita Pasad and Felix Wu and Roshan Sharma and Wei-Lun Wu and Hung-Yi Lee and Karen Livescu and Shinji Watanabe}, + eprint = {2212.10525}, + primaryclass = {cs.CL}, + title = {SLUE Phase-2: A Benchmark Suite of Diverse Spoken Language Understanding Tasks}, + url = {https://arxiv.org/abs/2212.10525}, + year = {2023}, +} +""", descriptive_stats={ "n_samples": {"train": 6121}, }, diff --git a/mteb/tasks/Audio/AudioClassification/eng/VoxCelebSA.py 
b/mteb/tasks/Audio/AudioClassification/eng/VoxCelebSA.py index 417d9c35a9..08826b809a 100644 --- a/mteb/tasks/Audio/AudioClassification/eng/VoxCelebSA.py +++ b/mteb/tasks/Audio/AudioClassification/eng/VoxCelebSA.py @@ -28,15 +28,17 @@ class VoxCelebSA(AbsTaskAudioClassification): dialect=[], modalities=["audio"], sample_creation="found", - bibtex_citation="""@misc{shon2022sluenewbenchmarktasks, - title={SLUE: New Benchmark Tasks for Spoken Language Understanding Evaluation on Natural Speech}, - author={Suwon Shon and Ankita Pasad and Felix Wu and Pablo Brusco and Yoav Artzi and Karen Livescu and Kyu J. Han}, - year={2022}, - eprint={2111.10367}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2111.10367}, - }""", + bibtex_citation=r""" +@misc{shon2022sluenewbenchmarktasks, + archiveprefix = {arXiv}, + author = {Suwon Shon and Ankita Pasad and Felix Wu and Pablo Brusco and Yoav Artzi and Karen Livescu and Kyu J. Han}, + eprint = {2111.10367}, + primaryclass = {cs.CL}, + title = {SLUE: New Benchmark Tasks for Spoken Language Understanding Evaluation on Natural Speech}, + url = {https://arxiv.org/abs/2111.10367}, + year = {2022}, +} +""", descriptive_stats={ "n_samples": { "train": 3449 diff --git a/mteb/tasks/Audio/AudioClassification/eng/VoxLingua107Top10.py b/mteb/tasks/Audio/AudioClassification/eng/VoxLingua107Top10.py index 266b29a402..02977b6ba4 100644 --- a/mteb/tasks/Audio/AudioClassification/eng/VoxLingua107Top10.py +++ b/mteb/tasks/Audio/AudioClassification/eng/VoxLingua107Top10.py @@ -28,15 +28,17 @@ class VoxLingua107Top10(AbsTaskAudioClassification): dialect=[], modalities=["audio"], sample_creation="found", # from youtube - bibtex_citation="""@misc{valk2020voxlingua107datasetspokenlanguage, - title={VoxLingua107: a Dataset for Spoken Language Recognition}, - author={Jörgen Valk and Tanel Alumäe}, - year={2020}, - eprint={2011.12998}, - archivePrefix={arXiv}, - primaryClass={eess.AS}, - url={https://arxiv.org/abs/2011.12998}, 
- }""", + bibtex_citation=r""" +@misc{valk2020voxlingua107datasetspokenlanguage, + archiveprefix = {arXiv}, + author = {Jörgen Valk and Tanel Alumäe}, + eprint = {2011.12998}, + primaryclass = {eess.AS}, + title = {VoxLingua107: a Dataset for Spoken Language Recognition}, + url = {https://arxiv.org/abs/2011.12998}, + year = {2020}, +} +""", descriptive_stats={ "n_samples": {"train": 972}, }, diff --git a/mteb/tasks/Audio/AudioMultilabelClassification/eng/FSD2019Kaggle.py b/mteb/tasks/Audio/AudioMultilabelClassification/eng/FSD2019Kaggle.py index fac10ca0ed..6d2f3613ab 100644 --- a/mteb/tasks/Audio/AudioMultilabelClassification/eng/FSD2019Kaggle.py +++ b/mteb/tasks/Audio/AudioMultilabelClassification/eng/FSD2019Kaggle.py @@ -36,21 +36,22 @@ class FSD2019KaggleMultilingualClassification( dialect=[], modalities=["audio"], sample_creation="found", - bibtex_citation="""@dataset{eduardo_fonseca_2020_3612637, - author = {Eduardo Fonseca and - Manoj Plakal and - Frederic Font and - Daniel P. W. Ellis and - Xavier Serra}, - title = {FSDKaggle2019}, - month = jan, - year = 2020, - publisher = {Zenodo}, - version = {1.0}, - doi = {10.5281/zenodo.3612637}, - url = {https://doi.org/10.5281/zenodo.3612637}, - } - """, + bibtex_citation=r""" +@dataset{eduardo_fonseca_2020_3612637, + author = {Eduardo Fonseca and +Manoj Plakal and +Frederic Font and +Daniel P. W. 
Ellis and +Xavier Serra}, + doi = {10.5281/zenodo.3612637}, + month = jan, + publisher = {Zenodo}, + title = {FSDKaggle2019}, + url = {https://doi.org/10.5281/zenodo.3612637}, + version = {1.0}, + year = {2020}, +} +""", descriptive_stats={ "n_samples": {"test": 8961}, }, diff --git a/mteb/tasks/Audio/AudioMultilabelClassification/eng/FSD50HF.py b/mteb/tasks/Audio/AudioMultilabelClassification/eng/FSD50HF.py index 337ce2325a..234011388c 100644 --- a/mteb/tasks/Audio/AudioMultilabelClassification/eng/FSD50HF.py +++ b/mteb/tasks/Audio/AudioMultilabelClassification/eng/FSD50HF.py @@ -31,17 +31,19 @@ class FSD50HFMultilingualClassification(AbsTaskAudioMultilabelClassification): dialect=[], modalities=["audio"], sample_creation="found", - bibtex_citation="""@ARTICLE{9645159, - author={Fonseca, Eduardo and Favory, Xavier and Pons, Jordi and Font, Frederic and Serra, Xavier}, - journal={IEEE/ACM Transactions on Audio, Speech, and Language Processing}, - title={FSD50K: An Open Dataset of Human-Labeled Sound Events}, - year={2022}, - volume={30}, - number={}, - pages={829-852}, - keywords={Videos;Task analysis;Labeling;Vocabulary;Speech recognition;Ontologies;Benchmark testing;Audio dataset;sound event;recognition;classification;tagging;data collection;environmental sound}, - doi={10.1109/TASLP.2021.3133208}} - """, + bibtex_citation=r""" +@article{9645159, + author = {Fonseca, Eduardo and Favory, Xavier and Pons, Jordi and Font, Frederic and Serra, Xavier}, + doi = {10.1109/TASLP.2021.3133208}, + journal = {IEEE/ACM Transactions on Audio, Speech, and Language Processing}, + keywords = {Videos;Task analysis;Labeling;Vocabulary;Speech recognition;Ontologies;Benchmark testing;Audio dataset;sound event;recognition;classification;tagging;data collection;environmental sound}, + number = {}, + pages = {829-852}, + title = {FSD50K: An Open Dataset of Human-Labeled Sound Events}, + volume = {30}, + year = {2022}, +} +""", ) audio_column_name: str = "audio" diff --git 
a/mteb/tasks/Audio/AudioZeroshotClassification/eng/ESC50.py b/mteb/tasks/Audio/AudioZeroshotClassification/eng/ESC50.py index 02e363db6c..2751bd8639 100644 --- a/mteb/tasks/Audio/AudioZeroshotClassification/eng/ESC50.py +++ b/mteb/tasks/Audio/AudioZeroshotClassification/eng/ESC50.py @@ -30,18 +30,20 @@ class ESC50ZeroshotClassification(AbsTaskAudioZeroshotClassification): dialect=[], modalities=["audio"], sample_creation="found", - bibtex_citation="""@inproceedings{piczak2015dataset, - title = {{ESC}: {Dataset} for {Environmental Sound Classification}}, - author = {Piczak, Karol J.}, - booktitle = {Proceedings of the 23rd {Annual ACM Conference} on {Multimedia}}, - date = {2015-10-13}, - url = {http://dl.acm.org/citation.cfm?doid=2733373.2806390}, - doi = {10.1145/2733373.2806390}, - location = {{Brisbane, Australia}}, - isbn = {978-1-4503-3459-4}, - publisher = {{ACM Press}}, - pages = {1015--1018} - }""", + bibtex_citation=r""" +@inproceedings{piczak2015dataset, + author = {Piczak, Karol J.}, + booktitle = {Proceedings of the 23rd {Annual ACM Conference} on {Multimedia}}, + date = {2015-10-13}, + doi = {10.1145/2733373.2806390}, + isbn = {978-1-4503-3459-4}, + location = {{Brisbane, Australia}}, + pages = {1015--1018}, + publisher = {{ACM Press}}, + title = {{ESC}: {Dataset} for {Environmental Sound Classification}}, + url = {http://dl.acm.org/citation.cfm?doid=2733373.2806390}, +} +""", descriptive_stats={ "n_samples": {"train": 2000}, # Need actual number }, diff --git a/mteb/tasks/Audio/AudioZeroshotClassification/eng/Ravdess.py b/mteb/tasks/Audio/AudioZeroshotClassification/eng/Ravdess.py index ff24127c82..54d471dc28 100644 --- a/mteb/tasks/Audio/AudioZeroshotClassification/eng/Ravdess.py +++ b/mteb/tasks/Audio/AudioZeroshotClassification/eng/Ravdess.py @@ -28,19 +28,21 @@ class RavdessZeroshotClassification(AbsTaskAudioZeroshotClassification): dialect=[], modalities=["audio"], sample_creation="found", - 
bibtex_citation="""@article{10.1371/journal.pone.0196391, - doi = {10.1371/journal.pone.0196391}, - author = {Livingstone, Steven R. AND Russo, Frank A.}, - journal = {PLOS ONE}, - publisher = {Public Library of Science}, - title = {The Ryerson Audio-Visual Database ofal Speech and Song (RAVDESS): A dynamic, multimodal set of facial and vocal expressions in North American English}, - year = {2018}, - month = {05}, - volume = {13}, - url = {https://doi.org/10.1371/journal.pone.0196391}, - pages = {1-35}, - number = {5}, - }""", + bibtex_citation=r""" +@article{10.1371/journal.pone.0196391, + author = {Livingstone, Steven R. AND Russo, Frank A.}, + doi = {10.1371/journal.pone.0196391}, + journal = {PLOS ONE}, + month = {05}, + number = {5}, + pages = {1-35}, + publisher = {Public Library of Science}, + title = {The Ryerson Audio-Visual Database of Emotional Speech and Song (RAVDESS): A dynamic, multimodal set of facial and vocal expressions in North American English}, + url = {https://doi.org/10.1371/journal.pone.0196391}, + volume = {13}, + year = {2018}, +} +""", ) audio_column_name: str = "audio" diff --git a/mteb/tasks/Audio/AudioZeroshotClassification/eng/UrbanSound8k.py b/mteb/tasks/Audio/AudioZeroshotClassification/eng/UrbanSound8k.py index 19268f83ec..8008cf6514 100644 --- a/mteb/tasks/Audio/AudioZeroshotClassification/eng/UrbanSound8k.py +++ b/mteb/tasks/Audio/AudioZeroshotClassification/eng/UrbanSound8k.py @@ -28,13 +28,15 @@ class UrbanSound8kZeroshotClassification(AbsTaskAudioZeroshotClassification): dialect=[], modalities=["audio"], sample_creation="found", - bibtex_citation="""@article{Salamon2014ADA, - title={A Dataset and Taxonomy for Urban Sound Research}, - author={Justin Salamon and Christopher Jacoby and Juan Pablo Bello}, - journal={Proceedings of the 22nd ACM international conference on Multimedia}, - year={2014}, - url={https://api.semanticscholar.org/CorpusID:207217115} -}""", + bibtex_citation=r""" +@article{Salamon2014ADA, + author = {Justin Salamon
and Christopher Jacoby and Juan Pablo Bello}, + journal = {Proceedings of the 22nd ACM international conference on Multimedia}, + title = {A Dataset and Taxonomy for Urban Sound Research}, + url = {https://api.semanticscholar.org/CorpusID:207217115}, + year = {2014}, +} +""", descriptive_stats={ "n_samples": {"train": 8732}, }, diff --git a/mteb/tasks/Audio/Clustering/eng/MusicGenre.py b/mteb/tasks/Audio/Clustering/eng/MusicGenre.py index b02cc4ebac..65214a4d50 100644 --- a/mteb/tasks/Audio/Clustering/eng/MusicGenre.py +++ b/mteb/tasks/Audio/Clustering/eng/MusicGenre.py @@ -27,12 +27,14 @@ class MusicGenreClustering(AbsTaskAudioClustering): dialect=[], modalities=["audio"], sample_creation="found", - bibtex_citation="""@inproceedings{homburg2005benchmark, - title={A Benchmark Dataset for Audio Classification and Clustering.}, - author={Homburg, Helge and Mierswa, Ingo and M{\"o}ller, B{\"u}lent and Morik, Katharina and Wurst, Michael}, - booktitle={ISMIR}, - volume={2005}, - pages={528--31}, - year={2005} - }""", + bibtex_citation=r""" +@inproceedings{homburg2005benchmark, + author = {Homburg, Helge and Mierswa, Ingo and M{\"o}ller, B{\"u}lent and Morik, Katharina and Wurst, Michael}, + booktitle = {ISMIR}, + pages = {528--31}, + title = {A Benchmark Dataset for Audio Classification and Clustering.}, + volume = {2005}, + year = {2005}, +} +""", ) diff --git a/mteb/tasks/Audio/Clustering/eng/VehicleSoundClustering.py b/mteb/tasks/Audio/Clustering/eng/VehicleSoundClustering.py index 074cf83b1c..c35e335739 100644 --- a/mteb/tasks/Audio/Clustering/eng/VehicleSoundClustering.py +++ b/mteb/tasks/Audio/Clustering/eng/VehicleSoundClustering.py @@ -26,13 +26,15 @@ class VehicleSoundClustering(AbsTaskAudioClustering): dialect=[], modalities=["audio"], sample_creation="created", - bibtex_citation="""@inproceedings{inproceedings, - author = {Bazilinskyy, Pavlo and Aa, Arne and Schoustra, Michael and Spruit, John and Staats, Laurens and van der Vlist, Klaas Jan and de Winter, 
Joost}, - year = {2018}, - month = {05}, - pages = {}, - title = {An auditory dataset of passing vehicles recorded with a smartphone} - }""", + bibtex_citation=r""" +@inproceedings{inproceedings, + author = {Bazilinskyy, Pavlo and Aa, Arne and Schoustra, Michael and Spruit, John and Staats, Laurens and van der Vlist, Klaas Jan and de Winter, Joost}, + month = {05}, + pages = {}, + title = {An auditory dataset of passing vehicles recorded with a smartphone}, + year = {2018}, +} +""", descriptive_stats={ "n_samples": {"test": 1705}, }, diff --git a/mteb/tasks/Audio/Clustering/eng/VoiceGender.py b/mteb/tasks/Audio/Clustering/eng/VoiceGender.py index 2ec8c89509..d7edfa87da 100644 --- a/mteb/tasks/Audio/Clustering/eng/VoiceGender.py +++ b/mteb/tasks/Audio/Clustering/eng/VoiceGender.py @@ -27,10 +27,12 @@ class VoiceGenderClustering(AbsTaskAudioClustering): dialect=[], modalities=["audio"], sample_creation="found", - bibtex_citation="""@InProceedings{Chung18b, - author = "Chung, J.~S. and Nagrani, A. 
and Zisserman, A.", - title = "VoxCeleb2: Deep Speaker Recognition", - booktitle = "INTERSPEECH", - year = "2018 - }""", + bibtex_citation=r""" +@inproceedings{Chung18b, + author = {Joon Son Chung and Arsha Nagrani and Andrew Zisserman}, + booktitle = {Proceedings of Interspeech}, + title = {VoxCeleb2: Deep Speaker Recognition}, + year = {2018}, +} +""", ) diff --git a/mteb/tasks/BitextMining/dan/BornholmskBitextMining.py b/mteb/tasks/BitextMining/dan/BornholmskBitextMining.py index 242f51ac37..a94765d961 100644 --- a/mteb/tasks/BitextMining/dan/BornholmskBitextMining.py +++ b/mteb/tasks/BitextMining/dan/BornholmskBitextMining.py @@ -27,18 +27,18 @@ class BornholmBitextMining(AbsTaskBitextMining): annotations_creators="expert-annotated", dialect=["da-dan-bornholm"], sample_creation="created", - bibtex_citation=""" + bibtex_citation=r""" @inproceedings{derczynskiBornholmskNaturalLanguage2019, - title = {Bornholmsk natural language processing: Resources and tools}, - url = {https://pure.itu.dk/ws/files/84551091/W19_6138.pdf}, - shorttitle = {Bornholmsk natural language processing}, - pages = {338--344}, - booktitle = {Proceedings of the Nordic Conference of Computational Linguistics (2019)}, - publisher = {Linköping University Electronic Press}, - author = {Derczynski, Leon and Kjeldsen, Alex Speed}, - urldate = {2024-04-24}, - date = {2019}, - file = {Available Version (via Google Scholar):/Users/au554730/Zotero/storage/FBQ73ZYN/Derczynski and Kjeldsen - 2019 - Bornholmsk natural language processing Resources .pdf:application/pdf}, + author = {Derczynski, Leon and Kjeldsen, Alex Speed}, + booktitle = {Proceedings of the Nordic Conference of Computational Linguistics (2019)}, + date = {2019}, + file = {Available Version (via Google Scholar):/Users/au554730/Zotero/storage/FBQ73ZYN/Derczynski and Kjeldsen - 2019 - Bornholmsk natural language processing Resources .pdf:application/pdf}, + pages = {338--344}, + publisher = {Linköping University Electronic Press}, + 
shorttitle = {Bornholmsk natural language processing}, + title = {Bornholmsk natural language processing: Resources and tools}, + url = {https://pure.itu.dk/ws/files/84551091/W19_6138.pdf}, + urldate = {2024-04-24}, } """, prompt="Retrieve parallel sentences.", diff --git a/mteb/tasks/BitextMining/eng/PubChemSMILESBitextMining.py b/mteb/tasks/BitextMining/eng/PubChemSMILESBitextMining.py index 4951d8c596..5e75a54c0c 100644 --- a/mteb/tasks/BitextMining/eng/PubChemSMILESBitextMining.py +++ b/mteb/tasks/BitextMining/eng/PubChemSMILESBitextMining.py @@ -41,24 +41,25 @@ class PubChemSMILESBitextMining(MultilingualTask, AbsTaskBitextMining): annotations_creators="derived", dialect=[], sample_creation="created", - bibtex_citation=""" - @article{kasmaee2024chemteb, - title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, - author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, - journal={arXiv preprint arXiv:2412.00532}, - year={2024} - } - @article{kim2023pubchem, - title={PubChem 2023 update}, - author={Kim, Sunghwan and Chen, Jie and Cheng, Tiejun and Gindulyte, Asta and He, Jia and He, Siqian and Li, Qingliang and Shoemaker, Benjamin A and Thiessen, Paul A and Yu, Bo and others}, - journal={Nucleic acids research}, - volume={51}, - number={D1}, - pages={D1373--D1380}, - year={2023}, - publisher={Oxford University Press} - } - """, + bibtex_citation=r""" +@article{kasmaee2024chemteb, + author = {Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal = {arXiv preprint arXiv:2412.00532}, + title = {ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + year = {2024}, +} + +@article{kim2023pubchem, + author = {Kim, 
Sunghwan and Chen, Jie and Cheng, Tiejun and Gindulyte, Asta and He, Jia and He, Siqian and Li, Qingliang and Shoemaker, Benjamin A and Thiessen, Paul A and Yu, Bo and others}, + journal = {Nucleic acids research}, + number = {D1}, + pages = {D1373--D1380}, + publisher = {Oxford University Press}, + title = {PubChem 2023 update}, + volume = {51}, + year = {2023}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/BitextMining/multilingual/BUCCBitextMining.py b/mteb/tasks/BitextMining/multilingual/BUCCBitextMining.py index 8c2563bbf3..c22883b112 100644 --- a/mteb/tasks/BitextMining/multilingual/BUCCBitextMining.py +++ b/mteb/tasks/BitextMining/multilingual/BUCCBitextMining.py @@ -43,24 +43,26 @@ class BUCCBitextMining(AbsTaskBitextMining, MultilingualTask): annotations_creators="human-annotated", dialect=[], sample_creation="human-translated", - bibtex_citation="""@inproceedings{zweigenbaum-etal-2017-overview, - title = "Overview of the Second {BUCC} Shared Task: Spotting Parallel Sentences in Comparable Corpora", - author = "Zweigenbaum, Pierre and - Sharoff, Serge and - Rapp, Reinhard", - editor = "Sharoff, Serge and - Zweigenbaum, Pierre and - Rapp, Reinhard", - booktitle = "Proceedings of the 10th Workshop on Building and Using Comparable Corpora", - month = aug, - year = "2017", - address = "Vancouver, Canada", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/W17-2512", - doi = "10.18653/v1/W17-2512", - pages = "60--67", - abstract = "This paper presents the BUCC 2017 shared task on parallel sentence extraction from comparable corpora. It recalls the design of the datasets, presents their final construction and statistics and the methods used to evaluate system results. 13 runs were submitted to the shared task by 4 teams, covering three of the four proposed language pairs: French-English (7 runs), German-English (3 runs), and Chinese-English (3 runs). 
The best F-scores as measured against the gold standard were 0.84 (German-English), 0.80 (French-English), and 0.43 (Chinese-English). Because of the design of the dataset, in which not all gold parallel sentence pairs are known, these are only minimum values. We examined manually a small sample of the false negative sentence pairs for the most precise French-English runs and estimated the number of parallel sentence pairs not yet in the provided gold standard. Adding them to the gold standard leads to revised estimates for the French-English F-scores of at most +1.5pt. This suggests that the BUCC 2017 datasets provide a reasonable approximate evaluation of the parallel sentence spotting task.", -}""", + bibtex_citation=r""" +@inproceedings{zweigenbaum-etal-2017-overview, + abstract = {This paper presents the BUCC 2017 shared task on parallel sentence extraction from comparable corpora. It recalls the design of the datasets, presents their final construction and statistics and the methods used to evaluate system results. 13 runs were submitted to the shared task by 4 teams, covering three of the four proposed language pairs: French-English (7 runs), German-English (3 runs), and Chinese-English (3 runs). The best F-scores as measured against the gold standard were 0.84 (German-English), 0.80 (French-English), and 0.43 (Chinese-English). Because of the design of the dataset, in which not all gold parallel sentence pairs are known, these are only minimum values. We examined manually a small sample of the false negative sentence pairs for the most precise French-English runs and estimated the number of parallel sentence pairs not yet in the provided gold standard. Adding them to the gold standard leads to revised estimates for the French-English F-scores of at most +1.5pt. 
This suggests that the BUCC 2017 datasets provide a reasonable approximate evaluation of the parallel sentence spotting task.}, + address = {Vancouver, Canada}, + author = {Zweigenbaum, Pierre and +Sharoff, Serge and +Rapp, Reinhard}, + booktitle = {Proceedings of the 10th Workshop on Building and Using Comparable Corpora}, + doi = {10.18653/v1/W17-2512}, + editor = {Sharoff, Serge and +Zweigenbaum, Pierre and +Rapp, Reinhard}, + month = aug, + pages = {60--67}, + publisher = {Association for Computational Linguistics}, + title = {Overview of the Second {BUCC} Shared Task: Spotting Parallel Sentences in Comparable Corpora}, + url = {https://aclanthology.org/W17-2512}, + year = {2017}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/BitextMining/multilingual/BUCCBitextMiningFast.py b/mteb/tasks/BitextMining/multilingual/BUCCBitextMiningFast.py index 6154003abe..567eb9a281 100644 --- a/mteb/tasks/BitextMining/multilingual/BUCCBitextMiningFast.py +++ b/mteb/tasks/BitextMining/multilingual/BUCCBitextMiningFast.py @@ -38,23 +38,25 @@ class BUCCBitextMiningFast(AbsTaskBitextMining, MultilingualTask): annotations_creators="human-annotated", dialect=[], sample_creation="human-translated", - bibtex_citation="""@inproceedings{zweigenbaum-etal-2017-overview, - title = "Overview of the Second {BUCC} Shared Task: Spotting Parallel Sentences in Comparable Corpora", - author = "Zweigenbaum, Pierre and - Sharoff, Serge and - Rapp, Reinhard", - editor = "Sharoff, Serge and - Zweigenbaum, Pierre and - Rapp, Reinhard", - booktitle = "Proceedings of the 10th Workshop on Building and Using Comparable Corpora", - month = aug, - year = "2017", - address = "Vancouver, Canada", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/W17-2512", - doi = "10.18653/v1/W17-2512", - pages = "60--67", - abstract = "This paper presents the BUCC 2017 shared task on parallel sentence extraction from comparable corpora. 
It recalls the design of the datasets, presents their final construction and statistics and the methods used to evaluate system results. 13 runs were submitted to the shared task by 4 teams, covering three of the four proposed language pairs: French-English (7 runs), German-English (3 runs), and Chinese-English (3 runs). The best F-scores as measured against the gold standard were 0.84 (German-English), 0.80 (French-English), and 0.43 (Chinese-English). Because of the design of the dataset, in which not all gold parallel sentence pairs are known, these are only minimum values. We examined manually a small sample of the false negative sentence pairs for the most precise French-English runs and estimated the number of parallel sentence pairs not yet in the provided gold standard. Adding them to the gold standard leads to revised estimates for the French-English F-scores of at most +1.5pt. This suggests that the BUCC 2017 datasets provide a reasonable approximate evaluation of the parallel sentence spotting task.", -}""", + bibtex_citation=r""" +@inproceedings{zweigenbaum-etal-2017-overview, + abstract = {This paper presents the BUCC 2017 shared task on parallel sentence extraction from comparable corpora. It recalls the design of the datasets, presents their final construction and statistics and the methods used to evaluate system results. 13 runs were submitted to the shared task by 4 teams, covering three of the four proposed language pairs: French-English (7 runs), German-English (3 runs), and Chinese-English (3 runs). The best F-scores as measured against the gold standard were 0.84 (German-English), 0.80 (French-English), and 0.43 (Chinese-English). Because of the design of the dataset, in which not all gold parallel sentence pairs are known, these are only minimum values. 
We examined manually a small sample of the false negative sentence pairs for the most precise French-English runs and estimated the number of parallel sentence pairs not yet in the provided gold standard. Adding them to the gold standard leads to revised estimates for the French-English F-scores of at most +1.5pt. This suggests that the BUCC 2017 datasets provide a reasonable approximate evaluation of the parallel sentence spotting task.}, + address = {Vancouver, Canada}, + author = {Zweigenbaum, Pierre and +Sharoff, Serge and +Rapp, Reinhard}, + booktitle = {Proceedings of the 10th Workshop on Building and Using Comparable Corpora}, + doi = {10.18653/v1/W17-2512}, + editor = {Sharoff, Serge and +Zweigenbaum, Pierre and +Rapp, Reinhard}, + month = aug, + pages = {60--67}, + publisher = {Association for Computational Linguistics}, + title = {Overview of the Second {BUCC} Shared Task: Spotting Parallel Sentences in Comparable Corpora}, + url = {https://aclanthology.org/W17-2512}, + year = {2017}, +} +""", adapted_from=["BUCC"], ) diff --git a/mteb/tasks/BitextMining/multilingual/BibleNLPBitextMining.py b/mteb/tasks/BitextMining/multilingual/BibleNLPBitextMining.py index 07724153c9..dedc2355c9 100644 --- a/mteb/tasks/BitextMining/multilingual/BibleNLPBitextMining.py +++ b/mteb/tasks/BitextMining/multilingual/BibleNLPBitextMining.py @@ -884,12 +884,14 @@ class BibleNLPBitextMining(AbsTaskBitextMining, MultilingualTask): annotations_creators="expert-annotated", dialect=[], sample_creation="created", - bibtex_citation="""@article{akerman2023ebible, - title={The eBible Corpus: Data and Model Benchmarks for Bible Translation for Low-Resource Languages}, - author={Akerman, Vesa and Baines, David and Daspit, Damien and Hermjakob, Ulf and Jang, Taeho and Leong, Colin and Martin, Michael and Mathew, Joel and Robie, Jonathan and Schwarting, Marcus}, - journal={arXiv preprint arXiv:2304.09919}, - year={2023} - }""", + bibtex_citation=r""" +@article{akerman2023ebible, + author = 
{Akerman, Vesa and Baines, David and Daspit, Damien and Hermjakob, Ulf and Jang, Taeho and Leong, Colin and Martin, Michael and Mathew, Joel and Robie, Jonathan and Schwarting, Marcus}, + journal = {arXiv preprint arXiv:2304.09919}, + title = {The eBible Corpus: Data and Model Benchmarks for Bible Translation for Low-Resource Languages}, + year = {2023}, +} +""", ) def load_data(self, **kwargs: Any) -> None: diff --git a/mteb/tasks/BitextMining/multilingual/DiaBLaBitextMining.py b/mteb/tasks/BitextMining/multilingual/DiaBLaBitextMining.py index b7806d60ac..893df2c8fd 100644 --- a/mteb/tasks/BitextMining/multilingual/DiaBLaBitextMining.py +++ b/mteb/tasks/BitextMining/multilingual/DiaBLaBitextMining.py @@ -33,15 +33,15 @@ class DiaBLaBitextMining(AbsTaskBitextMining, MultilingualTask): annotations_creators="human-annotated", dialect=[], sample_creation="created", - bibtex_citation=""" - @inproceedings{gonzalez2019diabla, - title={DiaBLa: A Corpus of Bilingual Spontaneous Written Dialogues for Machine Translation}, - author={González, Matilde and García, Clara and Sánchez, Lucía}, - booktitle={Proceedings of the 12th Language Resources and Evaluation Conference}, - pages={4192--4198}, - year={2019} - } - """, + bibtex_citation=r""" +@inproceedings{gonzalez2019diabla, + author = {González, Matilde and García, Clara and Sánchez, Lucía}, + booktitle = {Proceedings of the 12th Language Resources and Evaluation Conference}, + pages = {4192--4198}, + title = {DiaBLa: A Corpus of Bilingual Spontaneous Written Dialogues for Machine Translation}, + year = {2019}, +} +""", ) def load_data(self, **kwargs): diff --git a/mteb/tasks/BitextMining/multilingual/FloresBitextMining.py b/mteb/tasks/BitextMining/multilingual/FloresBitextMining.py index 786b5f0fd9..c28ef5944b 100644 --- a/mteb/tasks/BitextMining/multilingual/FloresBitextMining.py +++ b/mteb/tasks/BitextMining/multilingual/FloresBitextMining.py @@ -259,15 +259,15 @@ class FloresBitextMining(AbsTaskBitextMining, 
MultilingualTask): annotations_creators="human-annotated", dialect=[], sample_creation="created", - bibtex_citation=""" - @inproceedings{goyal2022flores, - title={The FLORES-101 Evaluation Benchmark for Low-Resource and Multilingual Machine Translation}, - author={Goyal, Naman and Gao, Cynthia and Chaudhary, Vishrav and Chen, Peng-Jen and Wenzek, Guillaume and Ju, Da and Krishnan, Sanjana and Ranzato, Marc'Aurelio and Guzm{\'a}n, Francisco}, - booktitle={Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies}, - pages={19--35}, - year={2022} - } - """, + bibtex_citation=r""" +@inproceedings{goyal2022flores, + author = {Goyal, Naman and Gao, Cynthia and Chaudhary, Vishrav and Chen, Peng-Jen and Wenzek, Guillaume and Ju, Da and Krishnan, Sanjana and Ranzato, Marc'Aurelio and Guzm{\'a}n, Francisco}, + booktitle = {Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies}, + pages = {19--35}, + title = {The FLORES-101 Evaluation Benchmark for Low-Resource and Multilingual Machine Translation}, + year = {2022}, +} +""", ) def load_data(self, **kwargs: Any) -> None: diff --git a/mteb/tasks/BitextMining/multilingual/IN22ConvBitextMining.py b/mteb/tasks/BitextMining/multilingual/IN22ConvBitextMining.py index 61a8717507..3ed3395d17 100644 --- a/mteb/tasks/BitextMining/multilingual/IN22ConvBitextMining.py +++ b/mteb/tasks/BitextMining/multilingual/IN22ConvBitextMining.py @@ -91,15 +91,17 @@ class IN22ConvBitextMining(AbsTaskBitextMining, MultilingualTask): annotations_creators="expert-annotated", dialect=[], sample_creation="created", - bibtex_citation="""@article{gala2023indictrans, -title={IndicTrans2: Towards High-Quality and Accessible Machine Translation Models for all 22 Scheduled Indian Languages}, -author={Jay Gala and Pranjal A Chitale and A K Raghavan and Varun Gumma and Sumanth Doddapaneni and 
Aswanth Kumar M and Janki Atul Nawale and Anupama Sujatha and Ratish Puduppully and Vivek Raghavan and Pratyush Kumar and Mitesh M Khapra and Raj Dabre and Anoop Kunchukuttan}, -journal={Transactions on Machine Learning Research}, -issn={2835-8856}, -year={2023}, -url={https://openreview.net/forum?id=vfT4YuzAYA}, -note={} -}""", + bibtex_citation=r""" +@article{gala2023indictrans, + author = {Jay Gala and Pranjal A Chitale and A K Raghavan and Varun Gumma and Sumanth Doddapaneni and Aswanth Kumar M and Janki Atul Nawale and Anupama Sujatha and Ratish Puduppully and Vivek Raghavan and Pratyush Kumar and Mitesh M Khapra and Raj Dabre and Anoop Kunchukuttan}, + issn = {2835-8856}, + journal = {Transactions on Machine Learning Research}, + note = {}, + title = {IndicTrans2: Towards High-Quality and Accessible Machine Translation Models for all 22 Scheduled Indian Languages}, + url = {https://openreview.net/forum?id=vfT4YuzAYA}, + year = {2023}, +} +""", ) def load_data(self, **kwargs: Any) -> None: diff --git a/mteb/tasks/BitextMining/multilingual/IN22GenBitextMining.py b/mteb/tasks/BitextMining/multilingual/IN22GenBitextMining.py index 503c64e5f0..5082a354d6 100644 --- a/mteb/tasks/BitextMining/multilingual/IN22GenBitextMining.py +++ b/mteb/tasks/BitextMining/multilingual/IN22GenBitextMining.py @@ -85,15 +85,17 @@ class IN22GenBitextMining(AbsTaskBitextMining, MultilingualTask): annotations_creators="expert-annotated", dialect=[], sample_creation="created", - bibtex_citation="""@article{gala2023indictrans, -title={IndicTrans2: Towards High-Quality and Accessible Machine Translation Models for all 22 Scheduled Indian Languages}, -author={Jay Gala and Pranjal A Chitale and A K Raghavan and Varun Gumma and Sumanth Doddapaneni and Aswanth Kumar M and Janki Atul Nawale and Anupama Sujatha and Ratish Puduppully and Vivek Raghavan and Pratyush Kumar and Mitesh M Khapra and Raj Dabre and Anoop Kunchukuttan}, -journal={Transactions on Machine Learning Research}, 
-issn={2835-8856}, -year={2023}, -url={https://openreview.net/forum?id=vfT4YuzAYA}, -note={} -}""", + bibtex_citation=r""" +@article{gala2023indictrans, + author = {Jay Gala and Pranjal A Chitale and A K Raghavan and Varun Gumma and Sumanth Doddapaneni and Aswanth Kumar M and Janki Atul Nawale and Anupama Sujatha and Ratish Puduppully and Vivek Raghavan and Pratyush Kumar and Mitesh M Khapra and Raj Dabre and Anoop Kunchukuttan}, + issn = {2835-8856}, + journal = {Transactions on Machine Learning Research}, + note = {}, + title = {IndicTrans2: Towards High-Quality and Accessible Machine Translation Models for all 22 Scheduled Indian Languages}, + url = {https://openreview.net/forum?id=vfT4YuzAYA}, + year = {2023}, +} +""", ) def load_data(self, **kwargs: Any) -> None: diff --git a/mteb/tasks/BitextMining/multilingual/IWSLT2017BitextMining.py b/mteb/tasks/BitextMining/multilingual/IWSLT2017BitextMining.py index ee83b6f5ca..6dfcaaa564 100644 --- a/mteb/tasks/BitextMining/multilingual/IWSLT2017BitextMining.py +++ b/mteb/tasks/BitextMining/multilingual/IWSLT2017BitextMining.py @@ -59,26 +59,26 @@ class IWSLT2017BitextMining(AbsTaskBitextMining, MultilingualTask): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" + bibtex_citation=r""" @inproceedings{cettolo-etal-2017-overview, - title = "Overview of the {IWSLT} 2017 Evaluation Campaign", - author = {Cettolo, Mauro and - Federico, Marcello and - Bentivogli, Luisa and - Niehues, Jan and - St{\"u}ker, Sebastian and - Sudoh, Katsuhito and - Yoshino, Koichiro and - Federmann, Christian}, - editor = "Sakti, Sakriani and - Utiyama, Masao", - booktitle = "Proceedings of the 14th International Conference on Spoken Language Translation", - month = dec # " 14-15", - year = "2017", - address = "Tokyo, Japan", - publisher = "International Workshop on Spoken Language Translation", - url = "https://aclanthology.org/2017.iwslt-1.1", - pages = "2--14", + address = {Tokyo, Japan}, + 
author = {Cettolo, Mauro and +Federico, Marcello and +Bentivogli, Luisa and +Niehues, Jan and +St{\"u}ker, Sebastian and +Sudoh, Katsuhito and +Yoshino, Koichiro and +Federmann, Christian}, + booktitle = {Proceedings of the 14th International Conference on Spoken Language Translation}, + editor = {Sakti, Sakriani and +Utiyama, Masao}, + month = dec # { 14-15}, + pages = {2--14}, + publisher = {International Workshop on Spoken Language Translation}, + title = {Overview of the {IWSLT} 2017 Evaluation Campaign}, + url = {https://aclanthology.org/2017.iwslt-1.1}, + year = {2017}, } """, ) diff --git a/mteb/tasks/BitextMining/multilingual/IndicGenBenchFloresBitextMining.py b/mteb/tasks/BitextMining/multilingual/IndicGenBenchFloresBitextMining.py index 4f5169619b..2a19e1c82c 100644 --- a/mteb/tasks/BitextMining/multilingual/IndicGenBenchFloresBitextMining.py +++ b/mteb/tasks/BitextMining/multilingual/IndicGenBenchFloresBitextMining.py @@ -120,14 +120,16 @@ class IndicGenBenchFloresBitextMining(AbsTaskBitextMining, MultilingualTask): annotations_creators="expert-annotated", dialect=[], sample_creation="human-translated and localized", - bibtex_citation="""@misc{singh2024indicgenbench, - title={IndicGenBench: A Multilingual Benchmark to Evaluate Generation Capabilities of LLMs on Indic Languages}, - author={Harman Singh and Nitish Gupta and Shikhar Bharadwaj and Dinesh Tewari and Partha Talukdar}, - year={2024}, - eprint={2404.16816}, - archivePrefix={arXiv}, - primaryClass={cs.CL} -}""", + bibtex_citation=r""" +@misc{singh2024indicgenbench, + archiveprefix = {arXiv}, + author = {Harman Singh and Nitish Gupta and Shikhar Bharadwaj and Dinesh Tewari and Partha Talukdar}, + eprint = {2404.16816}, + primaryclass = {cs.CL}, + title = {IndicGenBench: A Multilingual Benchmark to Evaluate Generation Capabilities of LLMs on Indic Languages}, + year = {2024}, +} +""", ) def load_data(self, **kwargs: Any) -> None: diff --git 
a/mteb/tasks/BitextMining/multilingual/LinceMTBitextMining.py b/mteb/tasks/BitextMining/multilingual/LinceMTBitextMining.py index 8abb8ce1ff..a0551b3eea 100644 --- a/mteb/tasks/BitextMining/multilingual/LinceMTBitextMining.py +++ b/mteb/tasks/BitextMining/multilingual/LinceMTBitextMining.py @@ -31,13 +31,13 @@ class LinceMTBitextMining(AbsTaskBitextMining, MultilingualTask): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @inproceedings{aguilar2020lince, - title={LinCE: A Centralized Benchmark for Linguistic Code-switching Evaluation}, - author={Aguilar, Gustavo and Kar, Sudipta and Solorio, Thamar}, - booktitle={Proceedings of the Twelfth Language Resources and Evaluation Conference}, - pages={1803--1813}, - year={2020} - } - """, + bibtex_citation=r""" +@inproceedings{aguilar2020lince, + author = {Aguilar, Gustavo and Kar, Sudipta and Solorio, Thamar}, + booktitle = {Proceedings of the Twelfth Language Resources and Evaluation Conference}, + pages = {1803--1813}, + title = {LinCE: A Centralized Benchmark for Linguistic Code-switching Evaluation}, + year = {2020}, +} +""", ) diff --git a/mteb/tasks/BitextMining/multilingual/NTREXBitextMining.py b/mteb/tasks/BitextMining/multilingual/NTREXBitextMining.py index 0137d9330d..99c0456517 100644 --- a/mteb/tasks/BitextMining/multilingual/NTREXBitextMining.py +++ b/mteb/tasks/BitextMining/multilingual/NTREXBitextMining.py @@ -270,17 +270,17 @@ class NTREXBitextMining(AbsTaskBitextMining, MultilingualTask): annotations_creators="expert-annotated", dialect=[], sample_creation="human-translated and localized", - bibtex_citation=""" + bibtex_citation=r""" @inproceedings{federmann-etal-2022-ntrex, - title = "{NTREX}-128 {--} News Test References for {MT} Evaluation of 128 Languages", - author = "Federmann, Christian and Kocmi, Tom and Xin, Ying", - booktitle = "Proceedings of the First Workshop on Scaling Up Multilingual Evaluation", - month = "nov", - year = "2022", - 
address = "Online", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/2022.sumeval-1.4", - pages = "21--24", + address = {Online}, + author = {Federmann, Christian and Kocmi, Tom and Xin, Ying}, + booktitle = {Proceedings of the First Workshop on Scaling Up Multilingual Evaluation}, + month = {nov}, + pages = {21--24}, + publisher = {Association for Computational Linguistics}, + title = {{NTREX}-128 {--} News Test References for {MT} Evaluation of 128 Languages}, + url = {https://aclanthology.org/2022.sumeval-1.4}, + year = {2022}, } """, ) diff --git a/mteb/tasks/BitextMining/multilingual/NollySentiBitextMining.py b/mteb/tasks/BitextMining/multilingual/NollySentiBitextMining.py index 4662833008..ec00149a34 100644 --- a/mteb/tasks/BitextMining/multilingual/NollySentiBitextMining.py +++ b/mteb/tasks/BitextMining/multilingual/NollySentiBitextMining.py @@ -34,13 +34,13 @@ class NollySentiBitextMining(AbsTaskBitextMining, MultilingualTask): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @inproceedings{shode2023nollysenti, - title={NollySenti: Leveraging Transfer Learning and Machine Translation for Nigerian Movie Sentiment Classification}, - author={Shode, Iyanuoluwa and Adelani, David Ifeoluwa and Peng, Jing and Feldman, Anna}, - booktitle={Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)}, - pages={986--998}, - year={2023} - } - """, + bibtex_citation=r""" +@inproceedings{shode2023nollysenti, + author = {Shode, Iyanuoluwa and Adelani, David Ifeoluwa and Peng, Jing and Feldman, Anna}, + booktitle = {Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)}, + pages = {986--998}, + title = {NollySenti: Leveraging Transfer Learning and Machine Translation for Nigerian Movie Sentiment Classification}, + year = {2023}, +} +""", ) diff --git 
a/mteb/tasks/BitextMining/multilingual/NorwegianCourtsBitextMining.py b/mteb/tasks/BitextMining/multilingual/NorwegianCourtsBitextMining.py index 81a880974c..c7c4c15bdf 100644 --- a/mteb/tasks/BitextMining/multilingual/NorwegianCourtsBitextMining.py +++ b/mteb/tasks/BitextMining/multilingual/NorwegianCourtsBitextMining.py @@ -26,12 +26,12 @@ class NorwegianCourtsBitextMining(AbsTaskBitextMining): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" + bibtex_citation=r""" @inproceedings{opus4, - title={OPUS-MT — Building open translation services for the World}, - author={Tiedemann, J{\"o}rg and Thottingal, Santhosh}, - booktitle={Proceedings of the 22nd Annual Conference of the European Association for Machine Translation (EAMT)}, - year={2020} + author = {Tiedemann, J{\"o}rg and Thottingal, Santhosh}, + booktitle = {Proceedings of the 22nd Annual Conference of the European Association for Machine Translation (EAMT)}, + title = {OPUS-MT — Building open translation services for the World}, + year = {2020}, } """, prompt="Retrieve parallel sentences in Norwegian Bokmål and Nynorsk", diff --git a/mteb/tasks/BitextMining/multilingual/NusaTranslationBitextMining.py b/mteb/tasks/BitextMining/multilingual/NusaTranslationBitextMining.py index c328461746..02c1859b8f 100644 --- a/mteb/tasks/BitextMining/multilingual/NusaTranslationBitextMining.py +++ b/mteb/tasks/BitextMining/multilingual/NusaTranslationBitextMining.py @@ -41,14 +41,13 @@ class NusaTranslationBitextMining(AbsTaskBitextMining, MultilingualTask): annotations_creators="human-annotated", dialect=[], sample_creation="created", - bibtex_citation=""" - @inproceedings{cahyawijaya2023nusawrites, - title={NusaWrites: Constructing High-Quality Corpora for Underrepresented and Extremely Low-Resource Languages}, - author={Cahyawijaya, Samuel and Lovenia, Holy and Koto, Fajri and Adhista, Dea and Dave, Emmanuel and Oktavianti, Sarah and Akbar, Salsabil and Lee, Jhonson and 
Shadieq, Nuur and Cenggoro, Tjeng Wawan and others}, - booktitle={Proceedings of the 13th International Joint Conference on Natural Language Processing and the 3rd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)}, - pages={921--945}, - year={2023} - } - - """, + bibtex_citation=r""" +@inproceedings{cahyawijaya2023nusawrites, + author = {Cahyawijaya, Samuel and Lovenia, Holy and Koto, Fajri and Adhista, Dea and Dave, Emmanuel and Oktavianti, Sarah and Akbar, Salsabil and Lee, Jhonson and Shadieq, Nuur and Cenggoro, Tjeng Wawan and others}, + booktitle = {Proceedings of the 13th International Joint Conference on Natural Language Processing and the 3rd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)}, + pages = {921--945}, + title = {NusaWrites: Constructing High-Quality Corpora for Underrepresented and Extremely Low-Resource Languages}, + year = {2023}, +} +""", ) diff --git a/mteb/tasks/BitextMining/multilingual/NusaXBitextMining.py b/mteb/tasks/BitextMining/multilingual/NusaXBitextMining.py index ee609dac82..ac37841066 100644 --- a/mteb/tasks/BitextMining/multilingual/NusaXBitextMining.py +++ b/mteb/tasks/BitextMining/multilingual/NusaXBitextMining.py @@ -41,21 +41,22 @@ class NusaXBitextMining(AbsTaskBitextMining, MultilingualTask): annotations_creators="human-annotated", dialect=[], sample_creation="created", - bibtex_citation=""" - @inproceedings{winata2023nusax, - title={NusaX: Multilingual Parallel Sentiment Dataset for 10 Indonesian Local Languages}, - author={Winata, Genta Indra and Aji, Alham Fikri and Cahyawijaya, Samuel and Mahendra, Rahmad and Koto, Fajri and Romadhony, Ade and Kurniawan, Kemal and Moeljadi, David and Prasojo, Radityo Eko and Fung, Pascale and others}, - booktitle={Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics}, - pages={815--834}, - year={2023} - } - 
@misc{winata2024miners, - title={MINERS: Multilingual Language Models as Semantic Retrievers}, - author={Genta Indra Winata and Ruochen Zhang and David Ifeoluwa Adelani}, - year={2024}, - eprint={2406.07424}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - } - """, + bibtex_citation=r""" +@inproceedings{winata2023nusax, + author = {Winata, Genta Indra and Aji, Alham Fikri and Cahyawijaya, Samuel and Mahendra, Rahmad and Koto, Fajri and Romadhony, Ade and Kurniawan, Kemal and Moeljadi, David and Prasojo, Radityo Eko and Fung, Pascale and others}, + booktitle = {Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics}, + pages = {815--834}, + title = {NusaX: Multilingual Parallel Sentiment Dataset for 10 Indonesian Local Languages}, + year = {2023}, +} + +@misc{winata2024miners, + archiveprefix = {arXiv}, + author = {Genta Indra Winata and Ruochen Zhang and David Ifeoluwa Adelani}, + eprint = {2406.07424}, + primaryclass = {cs.CL}, + title = {MINERS: Multilingual Language Models as Semantic Retrievers}, + year = {2024}, +} +""", ) diff --git a/mteb/tasks/BitextMining/multilingual/PhincBitextMining.py b/mteb/tasks/BitextMining/multilingual/PhincBitextMining.py index c7fec75637..b4d9293834 100644 --- a/mteb/tasks/BitextMining/multilingual/PhincBitextMining.py +++ b/mteb/tasks/BitextMining/multilingual/PhincBitextMining.py @@ -31,13 +31,13 @@ class PhincBitextMining(AbsTaskBitextMining, MultilingualTask): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @inproceedings{srivastava2020phinc, - title={PHINC: A Parallel Hinglish Social Media Code-Mixed Corpus for Machine Translation}, - author={Srivastava, Vivek and Singh, Mayank}, - booktitle={Proceedings of the Sixth Workshop on Noisy User-generated Text (W-NUT 2020)}, - pages={41--49}, - year={2020} - } - """, + bibtex_citation=r""" +@inproceedings{srivastava2020phinc, + author = {Srivastava, Vivek and Singh, Mayank}, 
+ booktitle = {Proceedings of the Sixth Workshop on Noisy User-generated Text (W-NUT 2020)}, + pages = {41--49}, + title = {PHINC: A Parallel Hinglish Social Media Code-Mixed Corpus for Machine Translation}, + year = {2020}, +} +""", ) diff --git a/mteb/tasks/BitextMining/multilingual/TatoebaBitextMining.py b/mteb/tasks/BitextMining/multilingual/TatoebaBitextMining.py index 4312332022..6f265b7261 100644 --- a/mteb/tasks/BitextMining/multilingual/TatoebaBitextMining.py +++ b/mteb/tasks/BitextMining/multilingual/TatoebaBitextMining.py @@ -145,11 +145,11 @@ class TatoebaBitextMining(AbsTaskBitextMining, MultilingualTask): annotations_creators="human-annotated", dialect=[], # No specific dialect mentioned sample_creation="found", - bibtex_citation=""" - @misc{tatoeba, - author = {Tatoeba community}, - title = {Tatoeba: Collection of sentences and translations}, - year = {2021}, - } - """, + bibtex_citation=r""" +@misc{tatoeba, + author = {Tatoeba community}, + title = {Tatoeba: Collection of sentences and translations}, + year = {2021}, +} +""", ) diff --git a/mteb/tasks/BitextMining/multilingual/WebFAQBitextMining.py b/mteb/tasks/BitextMining/multilingual/WebFAQBitextMining.py index c0561e01ab..8b5fe51976 100644 --- a/mteb/tasks/BitextMining/multilingual/WebFAQBitextMining.py +++ b/mteb/tasks/BitextMining/multilingual/WebFAQBitextMining.py @@ -222,15 +222,17 @@ class WebFAQBitextMiningQuestions(AbsTaskBitextMining, MultilingualTask): annotations_creators="human-annotated", dialect=[], sample_creation="human-translated", - bibtex_citation="""@misc{dinzinger2025webfaq, - title={WebFAQ: A Multilingual Collection of Natural Q&A Datasets for Dense Retrieval}, - author={Michael Dinzinger and Laura Caspari and Kanishka Ghosh Dastidar and Jelena Mitrović and Michael Granitzer}, - year={2025}, - eprint={2502.20936}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2502.20936}, -}""", + bibtex_citation=r""" +@misc{dinzinger2025webfaq, + 
archiveprefix = {arXiv}, + author = {Michael Dinzinger and Laura Caspari and Kanishka Ghosh Dastidar and Jelena Mitrović and Michael Granitzer}, + eprint = {2502.20936}, + primaryclass = {cs.CL}, + title = {WebFAQ: A Multilingual Collection of Natural Q&A Datasets for Dense Retrieval}, + url = {https://arxiv.org/abs/2502.20936}, + year = {2025}, +} +""", ) def dataset_transform(self): @@ -276,15 +278,17 @@ class WebFAQBitextMiningQAs(AbsTaskBitextMining, MultilingualTask): annotations_creators="human-annotated", dialect=[], sample_creation="human-translated", - bibtex_citation="""@misc{dinzinger2025webfaq, - title={WebFAQ: A Multilingual Collection of Natural Q&A Datasets for Dense Retrieval}, - author={Michael Dinzinger and Laura Caspari and Kanishka Ghosh Dastidar and Jelena Mitrović and Michael Granitzer}, - year={2025}, - eprint={2502.20936}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2502.20936}, -}""", + bibtex_citation=r""" +@misc{dinzinger2025webfaq, + archiveprefix = {arXiv}, + author = {Michael Dinzinger and Laura Caspari and Kanishka Ghosh Dastidar and Jelena Mitrović and Michael Granitzer}, + eprint = {2502.20936}, + primaryclass = {cs.CL}, + title = {WebFAQ: A Multilingual Collection of Natural Q&A Datasets for Dense Retrieval}, + url = {https://arxiv.org/abs/2502.20936}, + year = {2025}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/BitextMining/srn/SRNCorpusBitextMining.py b/mteb/tasks/BitextMining/srn/SRNCorpusBitextMining.py index b4072553b6..cdcfdb79d3 100644 --- a/mteb/tasks/BitextMining/srn/SRNCorpusBitextMining.py +++ b/mteb/tasks/BitextMining/srn/SRNCorpusBitextMining.py @@ -46,12 +46,12 @@ class SRNCorpusBitextMining(AbsTaskBitextMining, MultilingualTask): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" + bibtex_citation=r""" @article{zwennicker2022towards, - title={Towards a general purpose machine translation system for Sranantongo}, - 
author={Zwennicker, Just and Stap, David}, - journal={arXiv preprint arXiv:2212.06383}, - year={2022} + author = {Zwennicker, Just and Stap, David}, + journal = {arXiv preprint arXiv:2212.06383}, + title = {Towards a general purpose machine translation system for Sranantongo}, + year = {2022}, } """, ) diff --git a/mteb/tasks/BitextMining/vie/VieMedEVBitextMining.py b/mteb/tasks/BitextMining/vie/VieMedEVBitextMining.py index ab32025167..f841012f5f 100644 --- a/mteb/tasks/BitextMining/vie/VieMedEVBitextMining.py +++ b/mteb/tasks/BitextMining/vie/VieMedEVBitextMining.py @@ -33,12 +33,14 @@ class VieMedEVBitextMining(AbsTaskBitextMining): annotations_creators="expert-annotated", dialect=[], sample_creation="human-translated and localized", - bibtex_citation="""@inproceedings{medev, - title = {{Improving Vietnamese-English Medical Machine Translation}}, - author = {Nhu Vo and Dat Quoc Nguyen and Dung D. Le and Massimo Piccardi and Wray Buntine}, - booktitle = {Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING)}, - year = {2024} -}""", + bibtex_citation=r""" +@inproceedings{medev, + author = {Nhu Vo and Dat Quoc Nguyen and Dung D. 
Le and Massimo Piccardi and Wray Buntine}, + booktitle = {Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING)}, + title = {{Improving Vietnamese-English Medical Machine Translation}}, + year = {2024}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Classification/__init__.py b/mteb/tasks/Classification/__init__.py index b2aab22714..eafd1a7109 100644 --- a/mteb/tasks/Classification/__init__.py +++ b/mteb/tasks/Classification/__init__.py @@ -108,6 +108,7 @@ from .multilingual.NusaParagraphTopicClassification import * from .multilingual.NusaXSenti import * from .multilingual.ScalaClassification import * +from .multilingual.ScandiSentClassification import * from .multilingual.SIB200Classification import * from .multilingual.SouthAfricanLangClassification import * from .multilingual.SwissJudgementClassification import * @@ -129,9 +130,12 @@ from .rus.HeadlineClassification import * from .rus.InappropriatenessClassification import * from .rus.KinopoiskClassification import * +from .rus.ru_nlu_intent_classification import * +from .rus.ru_toixic_classification_okmlcup import * from .rus.RuReviewsClassification import * from .rus.RuSciBenchGRNTIClassification import * from .rus.RuSciBenchOECDClassification import * +from .rus.senti_ru_eval import * from .san.SanskritShlokasClassification import * from .sin.SinhalaNewsClassification import * from .sin.SinhalaNewsSourceClassification import * diff --git a/mteb/tasks/Classification/ara/AJGT.py b/mteb/tasks/Classification/ara/AJGT.py index 2baa389794..987415df7c 100644 --- a/mteb/tasks/Classification/ara/AJGT.py +++ b/mteb/tasks/Classification/ara/AJGT.py @@ -26,14 +26,14 @@ class AJGT(AbsTaskClassification): annotations_creators="human-annotated", dialect=["ara-arab-MSA", "ara-arab-JO"], sample_creation="found", - bibtex_citation=""" + bibtex_citation=r""" @inproceedings{alomari2017arabic, - title={Arabic tweets sentimental 
analysis using machine learning}, - author={Alomari, Khaled Mohammad and ElSherif, Hatem M and Shaalan, Khaled}, - booktitle={International Conference on Industrial, Engineering and Other Applications of Applied Intelligent Systems}, - pages={602--610}, - year={2017}, - organization={Springer} + author = {Alomari, Khaled Mohammad and ElSherif, Hatem M and Shaalan, Khaled}, + booktitle = {International Conference on Industrial, Engineering and Other Applications of Applied Intelligent Systems}, + organization = {Springer}, + pages = {602--610}, + title = {Arabic tweets sentimental analysis using machine learning}, + year = {2017}, } """, ) diff --git a/mteb/tasks/Classification/ara/HotelReviewSentimentClassification.py b/mteb/tasks/Classification/ara/HotelReviewSentimentClassification.py index 24b7bc33fc..7743f47531 100644 --- a/mteb/tasks/Classification/ara/HotelReviewSentimentClassification.py +++ b/mteb/tasks/Classification/ara/HotelReviewSentimentClassification.py @@ -27,14 +27,14 @@ class HotelReviewSentimentClassification(AbsTaskClassification): annotations_creators="derived", dialect=["ara-arab-EG", "ara-arab-JO", "ara-arab-LB", "ara-arab-SA"], sample_creation="found", - bibtex_citation=""" + bibtex_citation=r""" @article{elnagar2018hotel, - title={Hotel Arabic-reviews dataset construction for sentiment analysis applications}, - author={Elnagar, Ashraf and Khalifa, Yasmin S and Einea, Anas}, - journal={Intelligent natural language processing: Trends and applications}, - pages={35--52}, - year={2018}, - publisher={Springer} + author = {Elnagar, Ashraf and Khalifa, Yasmin S and Einea, Anas}, + journal = {Intelligent natural language processing: Trends and applications}, + pages = {35--52}, + publisher = {Springer}, + title = {Hotel Arabic-reviews dataset construction for sentiment analysis applications}, + year = {2018}, } """, ) diff --git a/mteb/tasks/Classification/ara/RestaurantReviewSentimentClassification.py 
b/mteb/tasks/Classification/ara/RestaurantReviewSentimentClassification.py index 363d0526d7..f5240e6702 100644 --- a/mteb/tasks/Classification/ara/RestaurantReviewSentimentClassification.py +++ b/mteb/tasks/Classification/ara/RestaurantReviewSentimentClassification.py @@ -26,14 +26,14 @@ class RestaurantReviewSentimentClassification(AbsTaskClassification): annotations_creators="derived", dialect=["ara-arab-EG", "ara-arab-JO", "ara-arab-SA"], sample_creation="found", - bibtex_citation=""" + bibtex_citation=r""" @inproceedings{elsahar2015building, - title={Building large arabic multi-domain resources for sentiment analysis}, - author={ElSahar, Hady and El-Beltagy, Samhaa R}, - booktitle={International conference on intelligent text processing and computational linguistics}, - pages={23--34}, - year={2015}, - organization={Springer} + author = {ElSahar, Hady and El-Beltagy, Samhaa R}, + booktitle = {International conference on intelligent text processing and computational linguistics}, + organization = {Springer}, + pages = {23--34}, + title = {Building large arabic multi-domain resources for sentiment analysis}, + year = {2015}, } """, ) diff --git a/mteb/tasks/Classification/ara/TweetEmotionClassification.py b/mteb/tasks/Classification/ara/TweetEmotionClassification.py index e7fb8687ac..e2da60f449 100644 --- a/mteb/tasks/Classification/ara/TweetEmotionClassification.py +++ b/mteb/tasks/Classification/ara/TweetEmotionClassification.py @@ -27,14 +27,14 @@ class TweetEmotionClassification(AbsTaskClassification): annotations_creators="human-annotated", dialect=["ara-arab-EG", "ara-arab-LB", "ara-arab-JO", "ara-arab-SA"], sample_creation="found", - bibtex_citation=""" + bibtex_citation=r""" @inproceedings{al2018emotional, - title={Emotional tone detection in arabic tweets}, - author={Al-Khatib, Amr and El-Beltagy, Samhaa R}, - booktitle={Computational Linguistics and Intelligent Text Processing: 18th International Conference, CICLing 2017, Budapest, Hungary, April 
17--23, 2017, Revised Selected Papers, Part II 18}, - pages={105--114}, - year={2018}, - organization={Springer} + author = {Al-Khatib, Amr and El-Beltagy, Samhaa R}, + booktitle = {Computational Linguistics and Intelligent Text Processing: 18th International Conference, CICLing 2017, Budapest, Hungary, April 17--23, 2017, Revised Selected Papers, Part II 18}, + organization = {Springer}, + pages = {105--114}, + title = {Emotional tone detection in arabic tweets}, + year = {2018}, } """, ) diff --git a/mteb/tasks/Classification/ara/TweetSarcasmClassification.py b/mteb/tasks/Classification/ara/TweetSarcasmClassification.py index 9c5f141d0b..2f7fb95f52 100644 --- a/mteb/tasks/Classification/ara/TweetSarcasmClassification.py +++ b/mteb/tasks/Classification/ara/TweetSarcasmClassification.py @@ -26,26 +26,26 @@ class TweetSarcasmClassification(AbsTaskClassification): annotations_creators="human-annotated", dialect=["ara-arab-EG", "ara-arab-LB", "ara-arab-MA", "ara-arab-SA"], sample_creation="found", - bibtex_citation=""" + bibtex_citation=r""" @inproceedings{abu-farha-magdy-2020-arabic, - title = "From {A}rabic Sentiment Analysis to Sarcasm Detection: The {A}r{S}arcasm Dataset", - author = "Abu Farha, Ibrahim and - Magdy, Walid", - editor = "Al-Khalifa, Hend and - Magdy, Walid and - Darwish, Kareem and - Elsayed, Tamer and - Mubarak, Hamdy", - booktitle = "Proceedings of the 4th Workshop on Open-Source Arabic Corpora and Processing Tools, with a Shared Task on Offensive Language Detection", - month = may, - year = "2020", - address = "Marseille, France", - publisher = "European Language Resource Association", - url = "https://aclanthology.org/2020.osact-1.5", - pages = "32--39", - abstract = "Sarcasm is one of the main challenges for sentiment analysis systems. Its complexity comes from the expression of opinion using implicit indirect phrasing. 
In this paper, we present ArSarcasm, an Arabic sarcasm detection dataset, which was created through the reannotation of available Arabic sentiment analysis datasets. The dataset contains 10,547 tweets, 16{\%} of which are sarcastic. In addition to sarcasm the data was annotated for sentiment and dialects. Our analysis shows the highly subjective nature of these tasks, which is demonstrated by the shift in sentiment labels based on annotators{'} biases. Experiments show the degradation of state-of-the-art sentiment analysers when faced with sarcastic content. Finally, we train a deep learning model for sarcasm detection using BiLSTM. The model achieves an F1 score of 0.46, which shows the challenging nature of the task, and should act as a basic baseline for future research on our dataset.", - language = "English", - ISBN = "979-10-95546-51-1", + abstract = {Sarcasm is one of the main challenges for sentiment analysis systems. Its complexity comes from the expression of opinion using implicit indirect phrasing. In this paper, we present ArSarcasm, an Arabic sarcasm detection dataset, which was created through the reannotation of available Arabic sentiment analysis datasets. The dataset contains 10,547 tweets, 16{\%} of which are sarcastic. In addition to sarcasm the data was annotated for sentiment and dialects. Our analysis shows the highly subjective nature of these tasks, which is demonstrated by the shift in sentiment labels based on annotators{'} biases. Experiments show the degradation of state-of-the-art sentiment analysers when faced with sarcastic content. Finally, we train a deep learning model for sarcasm detection using BiLSTM. 
The model achieves an F1 score of 0.46, which shows the challenging nature of the task, and should act as a basic baseline for future research on our dataset.}, + address = {Marseille, France}, + author = {Abu Farha, Ibrahim and +Magdy, Walid}, + booktitle = {Proceedings of the 4th Workshop on Open-Source Arabic Corpora and Processing Tools, with a Shared Task on Offensive Language Detection}, + editor = {Al-Khalifa, Hend and +Magdy, Walid and +Darwish, Kareem and +Elsayed, Tamer and +Mubarak, Hamdy}, + isbn = {979-10-95546-51-1}, + language = {English}, + month = may, + pages = {32--39}, + publisher = {European Language Resource Association}, + title = {From {A}rabic Sentiment Analysis to Sarcasm Detection: The {A}r{S}arcasm Dataset}, + url = {https://aclanthology.org/2020.osact-1.5}, + year = {2020}, } """, ) diff --git a/mteb/tasks/Classification/ben/BengaliDocumentClassification.py b/mteb/tasks/Classification/ben/BengaliDocumentClassification.py index 145eba57ab..c1fe72afee 100644 --- a/mteb/tasks/Classification/ben/BengaliDocumentClassification.py +++ b/mteb/tasks/Classification/ben/BengaliDocumentClassification.py @@ -26,22 +26,22 @@ class BengaliDocumentClassification(AbsTaskClassification): license="cc-by-nc-sa-4.0", annotations_creators="derived", sample_creation="found", - bibtex_citation=""" - @inproceedings{akash-etal-2023-shironaam, - title = "Shironaam: {B}engali News Headline Generation using Auxiliary Information", - author = "Akash, Abu Ubaida and - Nayeem, Mir Tafseer and - Shohan, Faisal Tareque and - Islam, Tanvir", - booktitle = "Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics", - month = may, - year = "2023", - address = "Dubrovnik, Croatia", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/2023.eacl-main.4", - pages = "52--67" - } - """, + bibtex_citation=r""" +@inproceedings{akash-etal-2023-shironaam, + address = {Dubrovnik, Croatia}, + 
author = {Akash, Abu Ubaida and +Nayeem, Mir Tafseer and +Shohan, Faisal Tareque and +Islam, Tanvir}, + booktitle = {Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics}, + month = may, + pages = {52--67}, + publisher = {Association for Computational Linguistics}, + title = {Shironaam: {B}engali News Headline Generation using Auxiliary Information}, + url = {https://aclanthology.org/2023.eacl-main.4}, + year = {2023}, +} +""", ) def dataset_transform(self) -> None: diff --git a/mteb/tasks/Classification/ben/BengaliHateSpeechClassification.py b/mteb/tasks/Classification/ben/BengaliHateSpeechClassification.py index 86763f0e50..9645415a58 100644 --- a/mteb/tasks/Classification/ben/BengaliHateSpeechClassification.py +++ b/mteb/tasks/Classification/ben/BengaliHateSpeechClassification.py @@ -26,12 +26,13 @@ class BengaliHateSpeechClassification(AbsTaskClassification): license="mit", annotations_creators="expert-annotated", sample_creation="found", - bibtex_citation="""@inproceedings{karim2020BengaliNLP, - title={Classification Benchmarks for Under-resourced Bengali Language based on Multichannel Convolutional-LSTM Network}, - author={Karim, Md. Rezaul and Chakravarti, Bharathi Raja and P. McCrae, John and Cochez, Michael}, - booktitle={7th IEEE International Conference on Data Science and Advanced Analytics (IEEE DSAA,2020)}, - publisher={IEEE}, - year={2020} + bibtex_citation=r""" +@inproceedings{karim2020BengaliNLP, + author = {Karim, Md. Rezaul and Chakravarti, Bharathi Raja and P. 
McCrae, John and Cochez, Michael}, + booktitle = {7th IEEE International Conference on Data Science and Advanced Analytics (IEEE DSAA,2020)}, + publisher = {IEEE}, + title = {Classification Benchmarks for Under-resourced Bengali Language based on Multichannel Convolutional-LSTM Network}, + year = {2020}, } """, ) diff --git a/mteb/tasks/Classification/ben/BengaliSentimentAnalysis.py b/mteb/tasks/Classification/ben/BengaliSentimentAnalysis.py index 87af91c8a8..b476bb4f58 100644 --- a/mteb/tasks/Classification/ben/BengaliSentimentAnalysis.py +++ b/mteb/tasks/Classification/ben/BengaliSentimentAnalysis.py @@ -26,13 +26,15 @@ class BengaliSentimentAnalysis(AbsTaskClassification): license="cc-by-4.0", annotations_creators="human-annotated", sample_creation="found", - bibtex_citation="""@inproceedings{sazzed2020cross, - title={Cross-lingual sentiment classification in low-resource Bengali language}, - author={Sazzed, Salim}, - booktitle={Proceedings of the Sixth Workshop on Noisy User-generated Text (W-NUT 2020)}, - pages={50--60}, - year={2020} - }""", + bibtex_citation=r""" +@inproceedings{sazzed2020cross, + author = {Sazzed, Salim}, + booktitle = {Proceedings of the Sixth Workshop on Noisy User-generated Text (W-NUT 2020)}, + pages = {50--60}, + title = {Cross-lingual sentiment classification in low-resource Bengali language}, + year = {2020}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Classification/bul/BulgarianStoreReviewSentimentClassfication.py b/mteb/tasks/Classification/bul/BulgarianStoreReviewSentimentClassfication.py index 7878fa89e2..d134f9af86 100644 --- a/mteb/tasks/Classification/bul/BulgarianStoreReviewSentimentClassfication.py +++ b/mteb/tasks/Classification/bul/BulgarianStoreReviewSentimentClassfication.py @@ -26,14 +26,15 @@ class BulgarianStoreReviewSentimentClassfication(AbsTaskClassification): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@data{DVN/TXIK9P_2018, -author = 
{Georgieva-Trifonova, Tsvetanka and Stefanova, Milena and Kalchev, Stefan}, -publisher = {Harvard Dataverse}, -title = {{Dataset for ``Customer Feedback Text Analysis for Online Stores Reviews in Bulgarian''}}, -year = {2018}, -version = {V1}, -doi = {10.7910/DVN/TXIK9P}, -url = {https://doi.org/10.7910/DVN/TXIK9P} + bibtex_citation=r""" +@data{DVN/TXIK9P_2018, + author = {Georgieva-Trifonova, Tsvetanka and Stefanova, Milena and Kalchev, Stefan}, + doi = {10.7910/DVN/TXIK9P}, + publisher = {Harvard Dataverse}, + title = {{Dataset for ``Customer Feedback Text Analysis for Online Stores Reviews in Bulgarian''}}, + url = {https://doi.org/10.7910/DVN/TXIK9P}, + version = {V1}, + year = {2018}, } """, ) diff --git a/mteb/tasks/Classification/ces/CSFDCZMovieReviewSentimentClassification.py b/mteb/tasks/Classification/ces/CSFDCZMovieReviewSentimentClassification.py index ca081f9a61..9b1f68f0a3 100644 --- a/mteb/tasks/Classification/ces/CSFDCZMovieReviewSentimentClassification.py +++ b/mteb/tasks/Classification/ces/CSFDCZMovieReviewSentimentClassification.py @@ -26,14 +26,14 @@ class CSFDCZMovieReviewSentimentClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation=""" + bibtex_citation=r""" @misc{štefánik2023resources, - title={Resources and Few-shot Learners for In-context Learning in Slavic Languages}, - author={Michal Štefánik and Marek Kadlčík and Piotr Gramacki and Petr Sojka}, - year={2023}, - eprint={2304.01922}, - archivePrefix={arXiv}, - primaryClass={cs.CL} + archiveprefix = {arXiv}, + author = {Michal Štefánik and Marek Kadlčík and Piotr Gramacki and Petr Sojka}, + eprint = {2304.01922}, + primaryclass = {cs.CL}, + title = {Resources and Few-shot Learners for In-context Learning in Slavic Languages}, + year = {2023}, } """, ) diff --git a/mteb/tasks/Classification/ces/CzechProductReviewSentimentClassification.py b/mteb/tasks/Classification/ces/CzechProductReviewSentimentClassification.py index 
8705a73c39..cd29ac4353 100644 --- a/mteb/tasks/Classification/ces/CzechProductReviewSentimentClassification.py +++ b/mteb/tasks/Classification/ces/CzechProductReviewSentimentClassification.py @@ -26,24 +26,24 @@ class CzechProductReviewSentimentClassification(AbsTaskClassification): license="cc-by-nc-sa-4.0", annotations_creators="derived", sample_creation="found", - bibtex_citation=""" - @inproceedings{habernal-etal-2013-sentiment, - title = "Sentiment Analysis in {C}zech Social Media Using Supervised Machine Learning", - author = "Habernal, Ivan and - Pt{\'a}{\v{c}}ek, Tom{\'a}{\v{s}} and - Steinberger, Josef", - editor = "Balahur, Alexandra and - van der Goot, Erik and - Montoyo, Andres", - booktitle = "Proceedings of the 4th Workshop on Computational Approaches to Subjectivity, Sentiment and Social Media Analysis", - month = jun, - year = "2013", - address = "Atlanta, Georgia", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/W13-1609", - pages = "65--74", - } - """, + bibtex_citation=r""" +@inproceedings{habernal-etal-2013-sentiment, + address = {Atlanta, Georgia}, + author = {Habernal, Ivan and +Pt{\'a}{\v{c}}ek, Tom{\'a}{\v{s}} and +Steinberger, Josef}, + booktitle = {Proceedings of the 4th Workshop on Computational Approaches to Subjectivity, Sentiment and Social Media Analysis}, + editor = {Balahur, Alexandra and +van der Goot, Erik and +Montoyo, Andres}, + month = jun, + pages = {65--74}, + publisher = {Association for Computational Linguistics}, + title = {Sentiment Analysis in {C}zech Social Media Using Supervised Machine Learning}, + url = {https://aclanthology.org/W13-1609}, + year = {2013}, +} +""", ) samples_per_label = 16 diff --git a/mteb/tasks/Classification/ces/CzechSoMeSentimentClassification.py b/mteb/tasks/Classification/ces/CzechSoMeSentimentClassification.py index 0e61196b19..333cd3aa4a 100644 --- a/mteb/tasks/Classification/ces/CzechSoMeSentimentClassification.py +++ 
b/mteb/tasks/Classification/ces/CzechSoMeSentimentClassification.py @@ -26,24 +26,24 @@ class CzechSoMeSentimentClassification(AbsTaskClassification): license="cc-by-nc-sa-4.0", annotations_creators="derived", sample_creation="found", - bibtex_citation=""" - @inproceedings{habernal-etal-2013-sentiment, - title = "Sentiment Analysis in {C}zech Social Media Using Supervised Machine Learning", - author = "Habernal, Ivan and - Pt{\'a}{\v{c}}ek, Tom{\'a}{\v{s}} and - Steinberger, Josef", - editor = "Balahur, Alexandra and - van der Goot, Erik and - Montoyo, Andres", - booktitle = "Proceedings of the 4th Workshop on Computational Approaches to Subjectivity, Sentiment and Social Media Analysis", - month = jun, - year = "2013", - address = "Atlanta, Georgia", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/W13-1609", - pages = "65--74", - } - """, + bibtex_citation=r""" +@inproceedings{habernal-etal-2013-sentiment, + address = {Atlanta, Georgia}, + author = {Habernal, Ivan and +Pt{\'a}{\v{c}}ek, Tom{\'a}{\v{s}} and +Steinberger, Josef}, + booktitle = {Proceedings of the 4th Workshop on Computational Approaches to Subjectivity, Sentiment and Social Media Analysis}, + editor = {Balahur, Alexandra and +van der Goot, Erik and +Montoyo, Andres}, + month = jun, + pages = {65--74}, + publisher = {Association for Computational Linguistics}, + title = {Sentiment Analysis in {C}zech Social Media Using Supervised Machine Learning}, + url = {https://aclanthology.org/W13-1609}, + year = {2013}, +} +""", ) samples_per_label = 16 diff --git a/mteb/tasks/Classification/ces/CzechSubjectivityClassification.py b/mteb/tasks/Classification/ces/CzechSubjectivityClassification.py index 18bcc7e10e..e2b1f10efc 100644 --- a/mteb/tasks/Classification/ces/CzechSubjectivityClassification.py +++ b/mteb/tasks/Classification/ces/CzechSubjectivityClassification.py @@ -26,17 +26,18 @@ class CzechSubjectivityClassification(AbsTaskClassification): 
annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{priban-steinberger-2022-czech, - title = "\{C\}zech Dataset for Cross-lingual Subjectivity Classification", - author = "P{\v{r}}ib{\'a}{\v{n}}, Pavel and - Steinberger, Josef", - booktitle = "Proceedings of the Thirteenth Language Resources and Evaluation Conference", - month = jun, - year = "2022", - address = "Marseille, France", - publisher = "European Language Resources Association", - url = "https://aclanthology.org/2022.lrec-1.148", - pages = "1381--1391", + bibtex_citation=r""" +@inproceedings{priban-steinberger-2022-czech, + address = {Marseille, France}, + author = {P{\v{r}}ib{\'a}{\v{n}}, Pavel and +Steinberger, Josef}, + booktitle = {Proceedings of the Thirteenth Language Resources and Evaluation Conference}, + month = jun, + pages = {1381--1391}, + publisher = {European Language Resources Association}, + title = {\{C\}zech Dataset for Cross-lingual Subjectivity Classification}, + url = {https://aclanthology.org/2022.lrec-1.148}, + year = {2022}, } """, ) diff --git a/mteb/tasks/Classification/dan/AngryTweetsClassification.py b/mteb/tasks/Classification/dan/AngryTweetsClassification.py index b22efde7a5..886612db48 100644 --- a/mteb/tasks/Classification/dan/AngryTweetsClassification.py +++ b/mteb/tasks/Classification/dan/AngryTweetsClassification.py @@ -26,13 +26,15 @@ class AngryTweetsClassification(AbsTaskClassification): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{pauli2021danlp, - title={DaNLP: An open-source toolkit for Danish Natural Language Processing}, - author={Pauli, Amalie Brogaard and Barrett, Maria and Lacroix, Oph{\'e}lie and Hvingelby, Rasmus}, - booktitle={Proceedings of the 23rd Nordic Conference on Computational Linguistics (NoDaLiDa)}, - pages={460--466}, - year={2021} -}""", + bibtex_citation=r""" +@inproceedings{pauli2021danlp, + author = {Pauli, Amalie 
Brogaard and Barrett, Maria and Lacroix, Oph{\'e}lie and Hvingelby, Rasmus}, + booktitle = {Proceedings of the 23rd Nordic Conference on Computational Linguistics (NoDaLiDa)}, + pages = {460--466}, + title = {DaNLP: An open-source toolkit for Danish Natural Language Processing}, + year = {2021}, +} +""", prompt="Classify Danish tweets by sentiment. (positive, negative, neutral).", ) diff --git a/mteb/tasks/Classification/dan/DKHateClassification.py b/mteb/tasks/Classification/dan/DKHateClassification.py index fb6c04cc40..e67e3ebee0 100644 --- a/mteb/tasks/Classification/dan/DKHateClassification.py +++ b/mteb/tasks/Classification/dan/DKHateClassification.py @@ -26,35 +26,37 @@ class DKHateClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{sigurbergsson-derczynski-2020-offensive, - title = "Offensive Language and Hate Speech Detection for {D}anish", - author = "Sigurbergsson, Gudbjartur Ingi and - Derczynski, Leon", - editor = "Calzolari, Nicoletta and - B{\'e}chet, Fr{\'e}d{\'e}ric and - Blache, Philippe and - Choukri, Khalid and - Cieri, Christopher and - Declerck, Thierry and - Goggi, Sara and - Isahara, Hitoshi and - Maegaard, Bente and - Mariani, Joseph and - Mazo, H{\'e}l{\`e}ne and - Moreno, Asuncion and - Odijk, Jan and - Piperidis, Stelios", - booktitle = "Proceedings of the Twelfth Language Resources and Evaluation Conference", - month = may, - year = "2020", - address = "Marseille, France", - publisher = "European Language Resources Association", - url = "https://aclanthology.org/2020.lrec-1.430", - pages = "3498--3508", - abstract = "The presence of offensive language on social media platforms and the implications this poses is becoming a major concern in modern society. Given the enormous amount of content created every day, automatic methods are required to detect and deal with this type of content. 
Until now, most of the research has focused on solving the problem for the English language, while the problem is multilingual. We construct a Danish dataset DKhate containing user-generated comments from various social media platforms, and to our knowledge, the first of its kind, annotated for various types and target of offensive language. We develop four automatic classification systems, each designed to work for both the English and the Danish language. In the detection of offensive language in English, the best performing system achieves a macro averaged F1-score of 0.74, and the best performing system for Danish achieves a macro averaged F1-score of 0.70. In the detection of whether or not an offensive post is targeted, the best performing system for English achieves a macro averaged F1-score of 0.62, while the best performing system for Danish achieves a macro averaged F1-score of 0.73. Finally, in the detection of the target type in a targeted offensive post, the best performing system for English achieves a macro averaged F1-score of 0.56, and the best performing system for Danish achieves a macro averaged F1-score of 0.63. Our work for both the English and the Danish language captures the type and targets of offensive language, and present automatic methods for detecting different kinds of offensive language such as hate speech and cyberbullying.", - language = "English", - ISBN = "979-10-95546-34-4", -}""", + bibtex_citation=r""" +@inproceedings{sigurbergsson-derczynski-2020-offensive, + abstract = {The presence of offensive language on social media platforms and the implications this poses is becoming a major concern in modern society. Given the enormous amount of content created every day, automatic methods are required to detect and deal with this type of content. Until now, most of the research has focused on solving the problem for the English language, while the problem is multilingual. 
We construct a Danish dataset DKhate containing user-generated comments from various social media platforms, and to our knowledge, the first of its kind, annotated for various types and target of offensive language. We develop four automatic classification systems, each designed to work for both the English and the Danish language. In the detection of offensive language in English, the best performing system achieves a macro averaged F1-score of 0.74, and the best performing system for Danish achieves a macro averaged F1-score of 0.70. In the detection of whether or not an offensive post is targeted, the best performing system for English achieves a macro averaged F1-score of 0.62, while the best performing system for Danish achieves a macro averaged F1-score of 0.73. Finally, in the detection of the target type in a targeted offensive post, the best performing system for English achieves a macro averaged F1-score of 0.56, and the best performing system for Danish achieves a macro averaged F1-score of 0.63. 
Our work for both the English and the Danish language captures the type and targets of offensive language, and present automatic methods for detecting different kinds of offensive language such as hate speech and cyberbullying.}, + address = {Marseille, France}, + author = {Sigurbergsson, Gudbjartur Ingi and +Derczynski, Leon}, + booktitle = {Proceedings of the Twelfth Language Resources and Evaluation Conference}, + editor = {Calzolari, Nicoletta and +B{\'e}chet, Fr{\'e}d{\'e}ric and +Blache, Philippe and +Choukri, Khalid and +Cieri, Christopher and +Declerck, Thierry and +Goggi, Sara and +Isahara, Hitoshi and +Maegaard, Bente and +Mariani, Joseph and +Mazo, H{\'e}l{\`e}ne and +Moreno, Asuncion and +Odijk, Jan and +Piperidis, Stelios}, + isbn = {979-10-95546-34-4}, + language = {English}, + month = may, + pages = {3498--3508}, + publisher = {European Language Resources Association}, + title = {Offensive Language and Hate Speech Detection for {D}anish}, + url = {https://aclanthology.org/2020.lrec-1.430}, + year = {2020}, +} +""", prompt="Classify Danish tweets based on offensiveness (offensive, not offensive)", ) diff --git a/mteb/tasks/Classification/dan/DanishPoliticalCommentsClassification.py b/mteb/tasks/Classification/dan/DanishPoliticalCommentsClassification.py index 8f82e91ecc..c0bac1528e 100644 --- a/mteb/tasks/Classification/dan/DanishPoliticalCommentsClassification.py +++ b/mteb/tasks/Classification/dan/DanishPoliticalCommentsClassification.py @@ -30,12 +30,14 @@ class DanishPoliticalCommentsClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@techreport{SAMsentiment, - author={Mads Guldborg Kjeldgaard Kongsbak and Steffan Eybye Christensen and Lucas Høyberg Puvis~de~Chavannes and Peter Due Jensen}, - title={Sentiment Analysis Multitool, SAM}, - year={2019}, - institution={IT University of Copenhagen}, -}""", + bibtex_citation=r""" +@techreport{SAMsentiment, + author = {Mads 
Guldborg Kjeldgaard Kongsbak and Steffan Eybye Christensen and Lucas Høyberg Puvis~de~Chavannes and Peter Due Jensen}, + institution = {IT University of Copenhagen}, + title = {Sentiment Analysis Multitool, SAM}, + year = {2019}, +} +""", prompt="Classify Danish political comments for sentiment", ) diff --git a/mteb/tasks/Classification/dan/DdiscoCohesionClassification.py b/mteb/tasks/Classification/dan/DdiscoCohesionClassification.py index b28396869e..c1eb16d190 100644 --- a/mteb/tasks/Classification/dan/DdiscoCohesionClassification.py +++ b/mteb/tasks/Classification/dan/DdiscoCohesionClassification.py @@ -26,36 +26,36 @@ class DdiscoCohesionClassification(AbsTaskClassification): license="cc-by-sa-3.0", annotations_creators="expert-annotated", sample_creation="found", - bibtex_citation=""" - @inproceedings{flansmose-mikkelsen-etal-2022-ddisco, - title = "{DD}is{C}o: A Discourse Coherence Dataset for {D}anish", - author = "Flansmose Mikkelsen, Linea and - Kinch, Oliver and - Jess Pedersen, Anders and - Lacroix, Oph{\'e}lie", - editor = "Calzolari, Nicoletta and - B{\'e}chet, Fr{\'e}d{\'e}ric and - Blache, Philippe and - Choukri, Khalid and - Cieri, Christopher and - Declerck, Thierry and - Goggi, Sara and - Isahara, Hitoshi and - Maegaard, Bente and - Mariani, Joseph and - Mazo, H{\'e}l{\`e}ne and - Odijk, Jan and - Piperidis, Stelios", - booktitle = "Proceedings of the Thirteenth Language Resources and Evaluation Conference", - month = jun, - year = "2022", - address = "Marseille, France", - publisher = "European Language Resources Association", - url = "https://aclanthology.org/2022.lrec-1.260", - pages = "2440--2445", - abstract = "To date, there has been no resource for studying discourse coherence on real-world Danish texts. Discourse coherence has mostly been approached with the assumption that incoherent texts can be represented by coherent texts in which sentences have been shuffled. However, incoherent real-world texts rarely resemble that. 
We thus present DDisCo, a dataset including text from the Danish Wikipedia and Reddit annotated for discourse coherence. We choose to annotate real-world texts instead of relying on artificially incoherent text for training and testing models. Then, we evaluate the performance of several methods, including neural networks, on the dataset.", + bibtex_citation=r""" +@inproceedings{flansmose-mikkelsen-etal-2022-ddisco, + abstract = {To date, there has been no resource for studying discourse coherence on real-world Danish texts. Discourse coherence has mostly been approached with the assumption that incoherent texts can be represented by coherent texts in which sentences have been shuffled. However, incoherent real-world texts rarely resemble that. We thus present DDisCo, a dataset including text from the Danish Wikipedia and Reddit annotated for discourse coherence. We choose to annotate real-world texts instead of relying on artificially incoherent text for training and testing models. Then, we evaluate the performance of several methods, including neural networks, on the dataset.}, + address = {Marseille, France}, + author = {Flansmose Mikkelsen, Linea and +Kinch, Oliver and +Jess Pedersen, Anders and +Lacroix, Oph{\'e}lie}, + booktitle = {Proceedings of the Thirteenth Language Resources and Evaluation Conference}, + editor = {Calzolari, Nicoletta and +B{\'e}chet, Fr{\'e}d{\'e}ric and +Blache, Philippe and +Choukri, Khalid and +Cieri, Christopher and +Declerck, Thierry and +Goggi, Sara and +Isahara, Hitoshi and +Maegaard, Bente and +Mariani, Joseph and +Mazo, H{\'e}l{\`e}ne and +Odijk, Jan and +Piperidis, Stelios}, + month = jun, + pages = {2440--2445}, + publisher = {European Language Resources Association}, + title = {{DD}is{C}o: A Discourse Coherence Dataset for {D}anish}, + url = {https://aclanthology.org/2022.lrec-1.260}, + year = {2022}, } - """, +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Classification/dan/LccSentimentClassification.py 
b/mteb/tasks/Classification/dan/LccSentimentClassification.py index 39b974dcd3..8ff87f3ec4 100644 --- a/mteb/tasks/Classification/dan/LccSentimentClassification.py +++ b/mteb/tasks/Classification/dan/LccSentimentClassification.py @@ -26,26 +26,28 @@ class LccSentimentClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{quasthoff-etal-2006-corpus, - title = "Corpus Portal for Search in Monolingual Corpora", - author = "Quasthoff, Uwe and - Richter, Matthias and - Biemann, Christian", - editor = "Calzolari, Nicoletta and - Choukri, Khalid and - Gangemi, Aldo and - Maegaard, Bente and - Mariani, Joseph and - Odijk, Jan and - Tapias, Daniel", - booktitle = "Proceedings of the Fifth International Conference on Language Resources and Evaluation ({LREC}{'}06)", - month = may, - year = "2006", - address = "Genoa, Italy", - publisher = "European Language Resources Association (ELRA)", - url = "http://www.lrec-conf.org/proceedings/lrec2006/pdf/641_pdf.pdf", - abstract = "A simple and flexible schema for storing and presenting monolingual language resources is proposed. In this format, data for 18 different languages is already available in various sizes. The data is provided free of charge for online use and download. The main target is to ease the application of algorithms for monolingual and interlingual studies.", -}""", + bibtex_citation=r""" +@inproceedings{quasthoff-etal-2006-corpus, + abstract = {A simple and flexible schema for storing and presenting monolingual language resources is proposed. In this format, data for 18 different languages is already available in various sizes. The data is provided free of charge for online use and download. 
The main target is to ease the application of algorithms for monolingual and interlingual studies.}, + address = {Genoa, Italy}, + author = {Quasthoff, Uwe and +Richter, Matthias and +Biemann, Christian}, + booktitle = {Proceedings of the Fifth International Conference on Language Resources and Evaluation ({LREC}{'}06)}, + editor = {Calzolari, Nicoletta and +Choukri, Khalid and +Gangemi, Aldo and +Maegaard, Bente and +Mariani, Joseph and +Odijk, Jan and +Tapias, Daniel}, + month = may, + publisher = {European Language Resources Association (ELRA)}, + title = {Corpus Portal for Search in Monolingual Corpora}, + url = {http://www.lrec-conf.org/proceedings/lrec2006/pdf/641_pdf.pdf}, + year = {2006}, +} +""", prompt="Classify texts based on sentiment", ) diff --git a/mteb/tasks/Classification/deu/GermanPoliticiansTwitterSentimentClassification.py b/mteb/tasks/Classification/deu/GermanPoliticiansTwitterSentimentClassification.py index 02cbe51f5f..979a70c707 100644 --- a/mteb/tasks/Classification/deu/GermanPoliticiansTwitterSentimentClassification.py +++ b/mteb/tasks/Classification/deu/GermanPoliticiansTwitterSentimentClassification.py @@ -26,28 +26,28 @@ class GermanPoliticiansTwitterSentimentClassification(AbsTaskClassification): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @inproceedings{schmidt-etal-2022-sentiment, - title = "Sentiment Analysis on {T}witter for the Major {G}erman Parties during the 2021 {G}erman Federal Election", - author = "Schmidt, Thomas and - Fehle, Jakob and - Weissenbacher, Maximilian and - Richter, Jonathan and - Gottschalk, Philipp and - Wolff, Christian", - editor = "Schaefer, Robin and - Bai, Xiaoyu and - Stede, Manfred and - Zesch, Torsten", - booktitle = "Proceedings of the 18th Conference on Natural Language Processing (KONVENS 2022)", - month = "12--15 " # sep, - year = "2022", - address = "Potsdam, Germany", - publisher = "KONVENS 2022 Organizers", - url = 
"https://aclanthology.org/2022.konvens-1.9", - pages = "74--87", - } - """, + bibtex_citation=r""" +@inproceedings{schmidt-etal-2022-sentiment, + address = {Potsdam, Germany}, + author = {Schmidt, Thomas and +Fehle, Jakob and +Weissenbacher, Maximilian and +Richter, Jonathan and +Gottschalk, Philipp and +Wolff, Christian}, + booktitle = {Proceedings of the 18th Conference on Natural Language Processing (KONVENS 2022)}, + editor = {Schaefer, Robin and +Bai, Xiaoyu and +Stede, Manfred and +Zesch, Torsten}, + month = {12--15 } # sep, + pages = {74--87}, + publisher = {KONVENS 2022 Organizers}, + title = {Sentiment Analysis on {T}witter for the Major {G}erman Parties during the 2021 {G}erman Federal Election}, + url = {https://aclanthology.org/2022.konvens-1.9}, + year = {2022}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Classification/deu/TenKGnadClassification.py b/mteb/tasks/Classification/deu/TenKGnadClassification.py index 592d66c983..12fe0223ff 100644 --- a/mteb/tasks/Classification/deu/TenKGnadClassification.py +++ b/mteb/tasks/Classification/deu/TenKGnadClassification.py @@ -27,16 +27,16 @@ class TenKGnadClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @InProceedings{Schabus2017, - Author = {Dietmar Schabus and Marcin Skowron and Martin Trapp}, - Title = {One Million Posts: A Data Set of German Online Discussions}, - Booktitle = {Proceedings of the 40th International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR)}, - Pages = {1241--1244}, - Year = {2017}, - Address = {Tokyo, Japan}, - Doi = {10.1145/3077136.3080711}, - Month = aug - } - """, + bibtex_citation=r""" +@inproceedings{Schabus2017, + address = {Tokyo, Japan}, + author = {Dietmar Schabus and Marcin Skowron and Martin Trapp}, + booktitle = {Proceedings of the 40th International ACM SIGIR Conference on Research and Development in Information Retrieval 
(SIGIR)}, + doi = {10.1145/3077136.3080711}, + month = aug, + pages = {1241--1244}, + title = {One Million Posts: A Data Set of German Online Discussions}, + year = {2017}, +} +""", ) diff --git a/mteb/tasks/Classification/ell/GreekLegalCodeClassification.py b/mteb/tasks/Classification/ell/GreekLegalCodeClassification.py index 29fb9bbb90..008d454a50 100644 --- a/mteb/tasks/Classification/ell/GreekLegalCodeClassification.py +++ b/mteb/tasks/Classification/ell/GreekLegalCodeClassification.py @@ -29,16 +29,17 @@ class GreekLegalCodeClassification(AbsTaskClassification): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{papaloukas-etal-2021-glc, - title = "Multi-granular Legal Topic Classification on Greek Legislation", - author = "Papaloukas, Christos and Chalkidis, Ilias and Athinaios, Konstantinos and Pantazi, Despina-Athanasia and Koubarakis, Manolis", - booktitle = "Proceedings of the Natural Legal Language Processing Workshop 2021", - year = "2021", - address = "Punta Cana, Dominican Republic", - publisher = "Association for Computational Linguistics", - url = "https://arxiv.org/abs/2109.15298", - doi = "10.48550/arXiv.2109.15298", - pages = "63--75" + bibtex_citation=r""" +@inproceedings{papaloukas-etal-2021-glc, + address = {Punta Cana, Dominican Republic}, + author = {Papaloukas, Christos and Chalkidis, Ilias and Athinaios, Konstantinos and Pantazi, Despina-Athanasia and Koubarakis, Manolis}, + booktitle = {Proceedings of the Natural Legal Language Processing Workshop 2021}, + doi = {10.48550/arXiv.2109.15298}, + pages = {63--75}, + publisher = {Association for Computational Linguistics}, + title = {Multi-granular Legal Topic Classification on Greek Legislation}, + url = {https://arxiv.org/abs/2109.15298}, + year = {2021}, } """, ) diff --git a/mteb/tasks/Classification/eng/AmazonPolarityClassification.py b/mteb/tasks/Classification/eng/AmazonPolarityClassification.py index 5f787e1af6..3c5b1350f1 
100644 --- a/mteb/tasks/Classification/eng/AmazonPolarityClassification.py +++ b/mteb/tasks/Classification/eng/AmazonPolarityClassification.py @@ -29,12 +29,14 @@ class AmazonPolarityClassification(AbsTaskClassification): license="apache-2.0", dialect=[], sample_creation="found", - bibtex_citation="""@article{McAuley2013HiddenFA, - title={Hidden factors and hidden topics: understanding rating dimensions with review text}, - author={Julian McAuley and Jure Leskovec}, - journal={Proceedings of the 7th ACM conference on Recommender systems}, - year={2013}, - url={https://api.semanticscholar.org/CorpusID:6440341} -}""", + bibtex_citation=r""" +@article{McAuley2013HiddenFA, + author = {Julian McAuley and Jure Leskovec}, + journal = {Proceedings of the 7th ACM conference on Recommender systems}, + title = {Hidden factors and hidden topics: understanding rating dimensions with review text}, + url = {https://api.semanticscholar.org/CorpusID:6440341}, + year = {2013}, +} +""", prompt="Classify Amazon reviews into positive or negative sentiment", ) diff --git a/mteb/tasks/Classification/eng/ArxivClassification.py b/mteb/tasks/Classification/eng/ArxivClassification.py index ae13b6f489..943f1c1cfe 100644 --- a/mteb/tasks/Classification/eng/ArxivClassification.py +++ b/mteb/tasks/Classification/eng/ArxivClassification.py @@ -27,14 +27,16 @@ class ArxivClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@ARTICLE{8675939, - author={He, Jun and Wang, Liqun and Liu, Liu and Feng, Jiao and Wu, Hao}, - journal={IEEE Access}, - title={Long Document Classification From Local Word Glimpses via Recurrent Attention Learning}, - year={2019}, - volume={7}, - number={}, - pages={40707-40718}, - doi={10.1109/ACCESS.2019.2907992} - }""", + bibtex_citation=r""" +@article{8675939, + author = {He, Jun and Wang, Liqun and Liu, Liu and Feng, Jiao and Wu, Hao}, + doi = {10.1109/ACCESS.2019.2907992}, + journal = {IEEE 
Access}, + number = {}, + pages = {40707-40718}, + title = {Long Document Classification From Local Word Glimpses via Recurrent Attention Learning}, + volume = {7}, + year = {2019}, +} +""", ) diff --git a/mteb/tasks/Classification/eng/Banking77Classification.py b/mteb/tasks/Classification/eng/Banking77Classification.py index 5b6db45c64..5581df7fb0 100644 --- a/mteb/tasks/Classification/eng/Banking77Classification.py +++ b/mteb/tasks/Classification/eng/Banking77Classification.py @@ -29,29 +29,31 @@ class Banking77Classification(AbsTaskClassification): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{casanueva-etal-2020-efficient, - title = "Efficient Intent Detection with Dual Sentence Encoders", - author = "Casanueva, I{\~n}igo and - Tem{\v{c}}inas, Tadas and - Gerz, Daniela and - Henderson, Matthew and - Vuli{\'c}, Ivan", - editor = "Wen, Tsung-Hsien and - Celikyilmaz, Asli and - Yu, Zhou and - Papangelis, Alexandros and - Eric, Mihail and - Kumar, Anuj and - Casanueva, I{\~n}igo and - Shah, Rushin", - booktitle = "Proceedings of the 2nd Workshop on Natural Language Processing for Conversational AI", - month = jul, - year = "2020", - address = "Online", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/2020.nlp4convai-1.5", - doi = "10.18653/v1/2020.nlp4convai-1.5", - pages = "38--45", -}""", + bibtex_citation=r""" +@inproceedings{casanueva-etal-2020-efficient, + address = {Online}, + author = {Casanueva, I{\~n}igo and +Tem{\v{c}}inas, Tadas and +Gerz, Daniela and +Henderson, Matthew and +Vuli{\'c}, Ivan}, + booktitle = {Proceedings of the 2nd Workshop on Natural Language Processing for Conversational AI}, + doi = {10.18653/v1/2020.nlp4convai-1.5}, + editor = {Wen, Tsung-Hsien and +Celikyilmaz, Asli and +Yu, Zhou and +Papangelis, Alexandros and +Eric, Mihail and +Kumar, Anuj and +Casanueva, I{\~n}igo and +Shah, Rushin}, + month = jul, + pages = {38--45}, + 
publisher = {Association for Computational Linguistics}, + title = {Efficient Intent Detection with Dual Sentence Encoders}, + url = {https://aclanthology.org/2020.nlp4convai-1.5}, + year = {2020}, +} +""", prompt="Given a online banking query, find the corresponding intents", ) diff --git a/mteb/tasks/Classification/eng/DBpediaClassification.py b/mteb/tasks/Classification/eng/DBpediaClassification.py index ac7ee41ae8..51904a4c08 100644 --- a/mteb/tasks/Classification/eng/DBpediaClassification.py +++ b/mteb/tasks/Classification/eng/DBpediaClassification.py @@ -26,19 +26,19 @@ class DBpediaClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation=""" - @inproceedings{NIPS2015_250cf8b5, - author = {Zhang, Xiang and Zhao, Junbo and LeCun, Yann}, - booktitle = {Advances in Neural Information Processing Systems}, - editor = {C. Cortes and N. Lawrence and D. Lee and M. Sugiyama and R. Garnett}, - pages = {}, - publisher = {Curran Associates, Inc.}, - title = {Character-level Convolutional Networks for Text Classification}, - url = {https://proceedings.neurips.cc/paper_files/paper/2015/file/250cf8b51c773f3f8dc8b4be867a9a02-Paper.pdf}, - volume = {28}, - year = {2015} - } - """, + bibtex_citation=r""" +@inproceedings{NIPS2015_250cf8b5, + author = {Zhang, Xiang and Zhao, Junbo and LeCun, Yann}, + booktitle = {Advances in Neural Information Processing Systems}, + editor = {C. Cortes and N. Lawrence and D. Lee and M. Sugiyama and R. 
Garnett}, + pages = {}, + publisher = {Curran Associates, Inc.}, + title = {Character-level Convolutional Networks for Text Classification}, + url = {https://proceedings.neurips.cc/paper_files/paper/2015/file/250cf8b51c773f3f8dc8b4be867a9a02-Paper.pdf}, + volume = {28}, + year = {2015}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Classification/eng/EmotionClassification.py b/mteb/tasks/Classification/eng/EmotionClassification.py index 05133cb17f..d04d8143a6 100644 --- a/mteb/tasks/Classification/eng/EmotionClassification.py +++ b/mteb/tasks/Classification/eng/EmotionClassification.py @@ -29,27 +29,29 @@ class EmotionClassification(AbsTaskClassification): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{saravia-etal-2018-carer, - title = "{CARER}: Contextualized Affect Representations for Emotion Recognition", - author = "Saravia, Elvis and - Liu, Hsien-Chi Toby and - Huang, Yen-Hao and - Wu, Junlin and - Chen, Yi-Shin", - editor = "Riloff, Ellen and - Chiang, David and - Hockenmaier, Julia and - Tsujii, Jun{'}ichi", - booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing", - month = oct # "-" # nov, - year = "2018", - address = "Brussels, Belgium", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/D18-1404", - doi = "10.18653/v1/D18-1404", - pages = "3687--3697", - abstract = "Emotions are expressed in nuanced ways, which varies by collective or individual experiences, knowledge, and beliefs. Therefore, to understand emotion, as conveyed through text, a robust mechanism capable of capturing and modeling different linguistic nuances and phenomena is needed. We propose a semi-supervised, graph-based algorithm to produce rich structural descriptors which serve as the building blocks for constructing contextualized affect representations from text. 
The pattern-based representations are further enriched with word embeddings and evaluated through several emotion recognition tasks. Our experimental results demonstrate that the proposed method outperforms state-of-the-art techniques on emotion recognition tasks.", -}""", + bibtex_citation=r""" +@inproceedings{saravia-etal-2018-carer, + abstract = {Emotions are expressed in nuanced ways, which varies by collective or individual experiences, knowledge, and beliefs. Therefore, to understand emotion, as conveyed through text, a robust mechanism capable of capturing and modeling different linguistic nuances and phenomena is needed. We propose a semi-supervised, graph-based algorithm to produce rich structural descriptors which serve as the building blocks for constructing contextualized affect representations from text. The pattern-based representations are further enriched with word embeddings and evaluated through several emotion recognition tasks. Our experimental results demonstrate that the proposed method outperforms state-of-the-art techniques on emotion recognition tasks.}, + address = {Brussels, Belgium}, + author = {Saravia, Elvis and +Liu, Hsien-Chi Toby and +Huang, Yen-Hao and +Wu, Junlin and +Chen, Yi-Shin}, + booktitle = {Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing}, + doi = {10.18653/v1/D18-1404}, + editor = {Riloff, Ellen and +Chiang, David and +Hockenmaier, Julia and +Tsujii, Jun{'}ichi}, + month = oct # {-} # nov, + pages = {3687--3697}, + publisher = {Association for Computational Linguistics}, + title = {{CARER}: Contextualized Affect Representations for Emotion Recognition}, + url = {https://aclanthology.org/D18-1404}, + year = {2018}, +} +""", prompt="Classify the emotion expressed in the given Twitter message into one of the six emotions: anger, fear, joy, love, sadness, and surprise", ) diff --git a/mteb/tasks/Classification/eng/FinancialPhrasebankClassification.py 
b/mteb/tasks/Classification/eng/FinancialPhrasebankClassification.py index b9abb5445a..6d3f672f9d 100644 --- a/mteb/tasks/Classification/eng/FinancialPhrasebankClassification.py +++ b/mteb/tasks/Classification/eng/FinancialPhrasebankClassification.py @@ -28,15 +28,15 @@ class FinancialPhrasebankClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @article{Malo2014GoodDO, - title={Good debt or bad debt: Detecting semantic orientations in economic texts}, - author={P. Malo and A. Sinha and P. Korhonen and J. Wallenius and P. Takala}, - journal={Journal of the Association for Information Science and Technology}, - year={2014}, - volume={65} - } - """, + bibtex_citation=r""" +@article{Malo2014GoodDO, + author = {P. Malo and A. Sinha and P. Korhonen and J. Wallenius and P. Takala}, + journal = {Journal of the Association for Information Science and Technology}, + title = {Good debt or bad debt: Detecting semantic orientations in economic texts}, + volume = {65}, + year = {2014}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Classification/eng/FrenkEnClassification.py b/mteb/tasks/Classification/eng/FrenkEnClassification.py index 0d435caf93..b9de110e20 100644 --- a/mteb/tasks/Classification/eng/FrenkEnClassification.py +++ b/mteb/tasks/Classification/eng/FrenkEnClassification.py @@ -27,13 +27,15 @@ class FrenkEnClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@misc{ljubešić2019frenk, - title={The FRENK Datasets of Socially Unacceptable Discourse in Slovene and English}, - author={Nikola Ljubešić and Darja Fišer and Tomaž Erjavec}, - year={2019}, - eprint={1906.02045}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/1906.02045} - }""", + bibtex_citation=r""" +@misc{ljubešić2019frenk, + archiveprefix = {arXiv}, + author = {Nikola Ljubešić and Darja Fišer and 
Tomaž Erjavec}, + eprint = {1906.02045}, + primaryclass = {cs.CL}, + title = {The FRENK Datasets of Socially Unacceptable Discourse in Slovene and English}, + url = {https://arxiv.org/abs/1906.02045}, + year = {2019}, +} +""", ) diff --git a/mteb/tasks/Classification/eng/ImdbClassification.py b/mteb/tasks/Classification/eng/ImdbClassification.py index 75b540bf47..df2ac734ed 100644 --- a/mteb/tasks/Classification/eng/ImdbClassification.py +++ b/mteb/tasks/Classification/eng/ImdbClassification.py @@ -29,24 +29,26 @@ class ImdbClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{maas-etal-2011-learning, - title = "Learning Word Vectors for Sentiment Analysis", - author = "Maas, Andrew L. and - Daly, Raymond E. and - Pham, Peter T. and - Huang, Dan and - Ng, Andrew Y. and - Potts, Christopher", - editor = "Lin, Dekang and - Matsumoto, Yuji and - Mihalcea, Rada", - booktitle = "Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies", - month = jun, - year = "2011", - address = "Portland, Oregon, USA", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/P11-1015", - pages = "142--150", -}""", + bibtex_citation=r""" +@inproceedings{maas-etal-2011-learning, + address = {Portland, Oregon, USA}, + author = {Maas, Andrew L. and +Daly, Raymond E. and +Pham, Peter T. and +Huang, Dan and +Ng, Andrew Y. 
and +Potts, Christopher}, + booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies}, + editor = {Lin, Dekang and +Matsumoto, Yuji and +Mihalcea, Rada}, + month = jun, + pages = {142--150}, + publisher = {Association for Computational Linguistics}, + title = {Learning Word Vectors for Sentiment Analysis}, + url = {https://aclanthology.org/P11-1015}, + year = {2011}, +} +""", prompt="Classify the sentiment expressed in the given movie review text from the IMDB dataset", ) diff --git a/mteb/tasks/Classification/eng/LegalBenchClassification.py b/mteb/tasks/Classification/eng/LegalBenchClassification.py index 8958aee7e6..d19df22a19 100644 --- a/mteb/tasks/Classification/eng/LegalBenchClassification.py +++ b/mteb/tasks/Classification/eng/LegalBenchClassification.py @@ -32,15 +32,16 @@ class CanadaTaxCourtOutcomesLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=["en-CA"], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }""", + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} +""", ) def dataset_transform(self): @@ -72,21 +73,23 @@ class ContractNLIConfidentialityOfAgreementLegalBenchClassification( annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. 
Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{koreeda2021contractnli, - title={ContractNLI: A dataset for document-level natural language inference for contracts}, - author={Koreeda, Yuta and Manning, Christopher D}, - journal={arXiv preprint arXiv:2110.01799}, - year={2021} - }""", + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{koreeda2021contractnli, + author = {Koreeda, Yuta and Manning, Christopher D}, + journal = {arXiv preprint arXiv:2110.01799}, + title = {ContractNLI: A dataset for document-level natural language inference for contracts}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -122,21 +125,23 @@ class ContractNLIExplicitIdentificationLegalBenchClassification(AbsTaskClassific annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{koreeda2021contractnli, - title={ContractNLI: A dataset for document-level natural language inference for contracts}, - author={Koreeda, Yuta and Manning, Christopher D}, - journal={arXiv preprint arXiv:2110.01799}, - year={2021} - }""", + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{koreeda2021contractnli, + author = {Koreeda, Yuta and Manning, Christopher D}, + journal = {arXiv preprint arXiv:2110.01799}, + title = {ContractNLI: A dataset for document-level natural language inference for contracts}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -174,21 +179,23 @@ class ContractNLIInclusionOfVerballyConveyedInformationLegalBenchClassification( annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{koreeda2021contractnli, - title={ContractNLI: A dataset for document-level natural language inference for contracts}, - author={Koreeda, Yuta and Manning, Christopher D}, - journal={arXiv preprint arXiv:2110.01799}, - year={2021} - }""", + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{koreeda2021contractnli, + author = {Koreeda, Yuta and Manning, Christopher D}, + journal = {arXiv preprint arXiv:2110.01799}, + title = {ContractNLI: A dataset for document-level natural language inference for contracts}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -224,21 +231,23 @@ class ContractNLILimitedUseLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{koreeda2021contractnli, - title={ContractNLI: A dataset for document-level natural language inference for contracts}, - author={Koreeda, Yuta and Manning, Christopher D}, - journal={arXiv preprint arXiv:2110.01799}, - year={2021} - }""", + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{koreeda2021contractnli, + author = {Koreeda, Yuta and Manning, Christopher D}, + journal = {arXiv preprint arXiv:2110.01799}, + title = {ContractNLI: A dataset for document-level natural language inference for contracts}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -274,21 +283,23 @@ class ContractNLINoLicensingLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{koreeda2021contractnli, - title={ContractNLI: A dataset for document-level natural language inference for contracts}, - author={Koreeda, Yuta and Manning, Christopher D}, - journal={arXiv preprint arXiv:2110.01799}, - year={2021} - }""", + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{koreeda2021contractnli, + author = {Koreeda, Yuta and Manning, Christopher D}, + journal = {arXiv preprint arXiv:2110.01799}, + title = {ContractNLI: A dataset for document-level natural language inference for contracts}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -326,21 +337,23 @@ class ContractNLINoticeOnCompelledDisclosureLegalBenchClassification( annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{koreeda2021contractnli, - title={ContractNLI: A dataset for document-level natural language inference for contracts}, - author={Koreeda, Yuta and Manning, Christopher D}, - journal={arXiv preprint arXiv:2110.01799}, - year={2021} - }""", + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{koreeda2021contractnli, + author = {Koreeda, Yuta and Manning, Christopher D}, + journal = {arXiv preprint arXiv:2110.01799}, + title = {ContractNLI: A dataset for document-level natural language inference for contracts}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -378,21 +391,23 @@ class ContractNLIPermissibleAcquirementOfSimilarInformationLegalBenchClassificat annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{koreeda2021contractnli, - title={ContractNLI: A dataset for document-level natural language inference for contracts}, - author={Koreeda, Yuta and Manning, Christopher D}, - journal={arXiv preprint arXiv:2110.01799}, - year={2021} - }""", + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{koreeda2021contractnli, + author = {Koreeda, Yuta and Manning, Christopher D}, + journal = {arXiv preprint arXiv:2110.01799}, + title = {ContractNLI: A dataset for document-level natural language inference for contracts}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -428,21 +443,23 @@ class ContractNLIPermissibleCopyLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{koreeda2021contractnli, - title={ContractNLI: A dataset for document-level natural language inference for contracts}, - author={Koreeda, Yuta and Manning, Christopher D}, - journal={arXiv preprint arXiv:2110.01799}, - year={2021} - }""", + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{koreeda2021contractnli, + author = {Koreeda, Yuta and Manning, Christopher D}, + journal = {arXiv preprint arXiv:2110.01799}, + title = {ContractNLI: A dataset for document-level natural language inference for contracts}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -480,21 +497,23 @@ class ContractNLIPermissibleDevelopmentOfSimilarInformationLegalBenchClassificat annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{koreeda2021contractnli, - title={ContractNLI: A dataset for document-level natural language inference for contracts}, - author={Koreeda, Yuta and Manning, Christopher D}, - journal={arXiv preprint arXiv:2110.01799}, - year={2021} - }""", + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{koreeda2021contractnli, + author = {Koreeda, Yuta and Manning, Christopher D}, + journal = {arXiv preprint arXiv:2110.01799}, + title = {ContractNLI: A dataset for document-level natural language inference for contracts}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -532,21 +551,23 @@ class ContractNLIPermissiblePostAgreementPossessionLegalBenchClassification( annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{koreeda2021contractnli, - title={ContractNLI: A dataset for document-level natural language inference for contracts}, - author={Koreeda, Yuta and Manning, Christopher D}, - journal={arXiv preprint arXiv:2110.01799}, - year={2021} - }""", + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{koreeda2021contractnli, + author = {Koreeda, Yuta and Manning, Christopher D}, + journal = {arXiv preprint arXiv:2110.01799}, + title = {ContractNLI: A dataset for document-level natural language inference for contracts}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -584,21 +605,23 @@ class ContractNLIReturnOfConfidentialInformationLegalBenchClassification( annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{koreeda2021contractnli, - title={ContractNLI: A dataset for document-level natural language inference for contracts}, - author={Koreeda, Yuta and Manning, Christopher D}, - journal={arXiv preprint arXiv:2110.01799}, - year={2021} - }""", + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{koreeda2021contractnli, + author = {Koreeda, Yuta and Manning, Christopher D}, + journal = {arXiv preprint arXiv:2110.01799}, + title = {ContractNLI: A dataset for document-level natural language inference for contracts}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -634,21 +657,23 @@ class ContractNLISharingWithEmployeesLegalBenchClassification(AbsTaskClassificat annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{koreeda2021contractnli, - title={ContractNLI: A dataset for document-level natural language inference for contracts}, - author={Koreeda, Yuta and Manning, Christopher D}, - journal={arXiv preprint arXiv:2110.01799}, - year={2021} - }""", + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{koreeda2021contractnli, + author = {Koreeda, Yuta and Manning, Christopher D}, + journal = {arXiv preprint arXiv:2110.01799}, + title = {ContractNLI: A dataset for document-level natural language inference for contracts}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -684,21 +709,23 @@ class ContractNLISharingWithThirdPartiesLegalBenchClassification(AbsTaskClassifi annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{koreeda2021contractnli, - title={ContractNLI: A dataset for document-level natural language inference for contracts}, - author={Koreeda, Yuta and Manning, Christopher D}, - journal={arXiv preprint arXiv:2110.01799}, - year={2021} - }""", + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{koreeda2021contractnli, + author = {Koreeda, Yuta and Manning, Christopher D}, + journal = {arXiv preprint arXiv:2110.01799}, + title = {ContractNLI: A dataset for document-level natural language inference for contracts}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -734,21 +761,23 @@ class ContractNLISurvivalOfObligationsLegalBenchClassification(AbsTaskClassifica annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{koreeda2021contractnli, - title={ContractNLI: A dataset for document-level natural language inference for contracts}, - author={Koreeda, Yuta and Manning, Christopher D}, - journal={arXiv preprint arXiv:2110.01799}, - year={2021} - }""", + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{koreeda2021contractnli, + author = {Koreeda, Yuta and Manning, Christopher D}, + journal = {arXiv preprint arXiv:2110.01799}, + title = {ContractNLI: A dataset for document-level natural language inference for contracts}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -784,16 +813,16 @@ class CorporateLobbyingLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} +""", ) def dataset_transform(self): @@ -844,22 +873,23 @@ class CUADAffiliateLicenseLicenseeLegalBenchClassification(AbsTaskClassification annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. 
Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{hendrycks2021cuad, - title={Cuad: An expert-annotated nlp dataset for legal contract review}, - author={Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, - journal={arXiv preprint arXiv:2103.06268}, - year={2021} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{hendrycks2021cuad, + author = {Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, + journal = {arXiv preprint arXiv:2103.06268}, + title = {Cuad: An expert-annotated nlp dataset for legal contract review}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -895,22 +925,23 @@ class CUADAffiliateLicenseLicensorLegalBenchClassification(AbsTaskClassification annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{hendrycks2021cuad, - title={Cuad: An expert-annotated nlp dataset for legal contract review}, - author={Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, - journal={arXiv preprint arXiv:2103.06268}, - year={2021} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{hendrycks2021cuad, + author = {Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, + journal = {arXiv preprint arXiv:2103.06268}, + title = {Cuad: An expert-annotated nlp dataset for legal contract review}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -946,22 +977,23 @@ class CUADAntiAssignmentLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{hendrycks2021cuad, - title={Cuad: An expert-annotated nlp dataset for legal contract review}, - author={Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, - journal={arXiv preprint arXiv:2103.06268}, - year={2021} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{hendrycks2021cuad, + author = {Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, + journal = {arXiv preprint arXiv:2103.06268}, + title = {Cuad: An expert-annotated nlp dataset for legal contract review}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -997,22 +1029,23 @@ class CUADAuditRightsLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{hendrycks2021cuad, - title={Cuad: An expert-annotated nlp dataset for legal contract review}, - author={Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, - journal={arXiv preprint arXiv:2103.06268}, - year={2021} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{hendrycks2021cuad, + author = {Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, + journal = {arXiv preprint arXiv:2103.06268}, + title = {Cuad: An expert-annotated nlp dataset for legal contract review}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -1048,22 +1081,23 @@ class CUADCapOnLiabilityLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{hendrycks2021cuad, - title={Cuad: An expert-annotated nlp dataset for legal contract review}, - author={Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, - journal={arXiv preprint arXiv:2103.06268}, - year={2021} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{hendrycks2021cuad, + author = {Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, + journal = {arXiv preprint arXiv:2103.06268}, + title = {Cuad: An expert-annotated nlp dataset for legal contract review}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -1099,22 +1133,23 @@ class CUADChangeOfControlLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{hendrycks2021cuad, - title={Cuad: An expert-annotated nlp dataset for legal contract review}, - author={Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, - journal={arXiv preprint arXiv:2103.06268}, - year={2021} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{hendrycks2021cuad, + author = {Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, + journal = {arXiv preprint arXiv:2103.06268}, + title = {Cuad: An expert-annotated nlp dataset for legal contract review}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -1152,22 +1187,23 @@ class CUADCompetitiveRestrictionExceptionLegalBenchClassification( annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{hendrycks2021cuad, - title={Cuad: An expert-annotated nlp dataset for legal contract review}, - author={Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, - journal={arXiv preprint arXiv:2103.06268}, - year={2021} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{hendrycks2021cuad, + author = {Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, + journal = {arXiv preprint arXiv:2103.06268}, + title = {Cuad: An expert-annotated nlp dataset for legal contract review}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -1203,22 +1239,23 @@ class CUADCovenantNotToSueLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{hendrycks2021cuad, - title={Cuad: An expert-annotated nlp dataset for legal contract review}, - author={Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, - journal={arXiv preprint arXiv:2103.06268}, - year={2021} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{hendrycks2021cuad, + author = {Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, + journal = {arXiv preprint arXiv:2103.06268}, + title = {Cuad: An expert-annotated nlp dataset for legal contract review}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -1254,22 +1291,23 @@ class CUADEffectiveDateLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{hendrycks2021cuad, - title={Cuad: An expert-annotated nlp dataset for legal contract review}, - author={Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, - journal={arXiv preprint arXiv:2103.06268}, - year={2021} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{hendrycks2021cuad, + author = {Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, + journal = {arXiv preprint arXiv:2103.06268}, + title = {Cuad: An expert-annotated nlp dataset for legal contract review}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -1305,22 +1343,23 @@ class CUADExclusivityLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{hendrycks2021cuad, - title={Cuad: An expert-annotated nlp dataset for legal contract review}, - author={Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, - journal={arXiv preprint arXiv:2103.06268}, - year={2021} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{hendrycks2021cuad, + author = {Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, + journal = {arXiv preprint arXiv:2103.06268}, + title = {Cuad: An expert-annotated nlp dataset for legal contract review}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -1356,22 +1395,23 @@ class CUADExpirationDateLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{hendrycks2021cuad, - title={Cuad: An expert-annotated nlp dataset for legal contract review}, - author={Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, - journal={arXiv preprint arXiv:2103.06268}, - year={2021} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{hendrycks2021cuad, + author = {Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, + journal = {arXiv preprint arXiv:2103.06268}, + title = {Cuad: An expert-annotated nlp dataset for legal contract review}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -1407,22 +1447,23 @@ class CUADGoverningLawLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{hendrycks2021cuad, - title={Cuad: An expert-annotated nlp dataset for legal contract review}, - author={Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, - journal={arXiv preprint arXiv:2103.06268}, - year={2021} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{hendrycks2021cuad, + author = {Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, + journal = {arXiv preprint arXiv:2103.06268}, + title = {Cuad: An expert-annotated nlp dataset for legal contract review}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -1458,22 +1499,23 @@ class CUADInsuranceLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{hendrycks2021cuad, - title={Cuad: An expert-annotated nlp dataset for legal contract review}, - author={Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, - journal={arXiv preprint arXiv:2103.06268}, - year={2021} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{hendrycks2021cuad, + author = {Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, + journal = {arXiv preprint arXiv:2103.06268}, + title = {Cuad: An expert-annotated nlp dataset for legal contract review}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -1509,22 +1551,23 @@ class CUADIPOwnershipAssignmentLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{hendrycks2021cuad, - title={Cuad: An expert-annotated nlp dataset for legal contract review}, - author={Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, - journal={arXiv preprint arXiv:2103.06268}, - year={2021} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{hendrycks2021cuad, + author = {Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, + journal = {arXiv preprint arXiv:2103.06268}, + title = {Cuad: An expert-annotated nlp dataset for legal contract review}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -1560,22 +1603,23 @@ class CUADIrrevocableOrPerpetualLicenseLegalBenchClassification(AbsTaskClassific annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{hendrycks2021cuad, - title={Cuad: An expert-annotated nlp dataset for legal contract review}, - author={Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, - journal={arXiv preprint arXiv:2103.06268}, - year={2021} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{hendrycks2021cuad, + author = {Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, + journal = {arXiv preprint arXiv:2103.06268}, + title = {Cuad: An expert-annotated nlp dataset for legal contract review}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -1611,22 +1655,23 @@ class CUADJointIPOwnershipLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{hendrycks2021cuad, - title={Cuad: An expert-annotated nlp dataset for legal contract review}, - author={Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, - journal={arXiv preprint arXiv:2103.06268}, - year={2021} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{hendrycks2021cuad, + author = {Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, + journal = {arXiv preprint arXiv:2103.06268}, + title = {Cuad: An expert-annotated nlp dataset for legal contract review}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -1662,22 +1707,23 @@ class CUADLicenseGrantLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{hendrycks2021cuad, - title={Cuad: An expert-annotated nlp dataset for legal contract review}, - author={Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, - journal={arXiv preprint arXiv:2103.06268}, - year={2021} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{hendrycks2021cuad, + author = {Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, + journal = {arXiv preprint arXiv:2103.06268}, + title = {Cuad: An expert-annotated nlp dataset for legal contract review}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -1713,22 +1759,23 @@ class CUADLiquidatedDamagesLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{hendrycks2021cuad, - title={Cuad: An expert-annotated nlp dataset for legal contract review}, - author={Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, - journal={arXiv preprint arXiv:2103.06268}, - year={2021} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{hendrycks2021cuad, + author = {Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, + journal = {arXiv preprint arXiv:2103.06268}, + title = {Cuad: An expert-annotated nlp dataset for legal contract review}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -1764,22 +1811,23 @@ class CUADMinimumCommitmentLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{hendrycks2021cuad, - title={Cuad: An expert-annotated nlp dataset for legal contract review}, - author={Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, - journal={arXiv preprint arXiv:2103.06268}, - year={2021} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{hendrycks2021cuad, + author = {Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, + journal = {arXiv preprint arXiv:2103.06268}, + title = {Cuad: An expert-annotated nlp dataset for legal contract review}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -1815,22 +1863,23 @@ class CUADMostFavoredNationLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{hendrycks2021cuad, - title={Cuad: An expert-annotated nlp dataset for legal contract review}, - author={Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, - journal={arXiv preprint arXiv:2103.06268}, - year={2021} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{hendrycks2021cuad, + author = {Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, + journal = {arXiv preprint arXiv:2103.06268}, + title = {Cuad: An expert-annotated nlp dataset for legal contract review}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -1866,22 +1915,23 @@ class CUADNoSolicitOfCustomersLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{hendrycks2021cuad, - title={Cuad: An expert-annotated nlp dataset for legal contract review}, - author={Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, - journal={arXiv preprint arXiv:2103.06268}, - year={2021} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{hendrycks2021cuad, + author = {Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, + journal = {arXiv preprint arXiv:2103.06268}, + title = {Cuad: An expert-annotated nlp dataset for legal contract review}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -1917,22 +1967,23 @@ class CUADNoSolicitOfEmployeesLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{hendrycks2021cuad, - title={Cuad: An expert-annotated nlp dataset for legal contract review}, - author={Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, - journal={arXiv preprint arXiv:2103.06268}, - year={2021} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{hendrycks2021cuad, + author = {Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, + journal = {arXiv preprint arXiv:2103.06268}, + title = {Cuad: An expert-annotated nlp dataset for legal contract review}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -1968,22 +2019,23 @@ class CUADNonCompeteLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{hendrycks2021cuad, - title={Cuad: An expert-annotated nlp dataset for legal contract review}, - author={Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, - journal={arXiv preprint arXiv:2103.06268}, - year={2021} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{hendrycks2021cuad, + author = {Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, + journal = {arXiv preprint arXiv:2103.06268}, + title = {Cuad: An expert-annotated nlp dataset for legal contract review}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -2019,22 +2071,23 @@ class CUADNonDisparagementLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{hendrycks2021cuad, - title={Cuad: An expert-annotated nlp dataset for legal contract review}, - author={Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, - journal={arXiv preprint arXiv:2103.06268}, - year={2021} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{hendrycks2021cuad, + author = {Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, + journal = {arXiv preprint arXiv:2103.06268}, + title = {Cuad: An expert-annotated nlp dataset for legal contract review}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -2070,22 +2123,23 @@ class CUADNonTransferableLicenseLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{hendrycks2021cuad, - title={Cuad: An expert-annotated nlp dataset for legal contract review}, - author={Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, - journal={arXiv preprint arXiv:2103.06268}, - year={2021} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{hendrycks2021cuad, + author = {Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, + journal = {arXiv preprint arXiv:2103.06268}, + title = {Cuad: An expert-annotated nlp dataset for legal contract review}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -2121,22 +2175,23 @@ class CUADNoticePeriodToTerminateRenewalLegalBenchClassification(AbsTaskClassifi annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{hendrycks2021cuad, - title={Cuad: An expert-annotated nlp dataset for legal contract review}, - author={Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, - journal={arXiv preprint arXiv:2103.06268}, - year={2021} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{hendrycks2021cuad, + author = {Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, + journal = {arXiv preprint arXiv:2103.06268}, + title = {Cuad: An expert-annotated nlp dataset for legal contract review}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -2172,22 +2227,23 @@ class CUADPostTerminationServicesLegalBenchClassification(AbsTaskClassification) annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{hendrycks2021cuad, - title={Cuad: An expert-annotated nlp dataset for legal contract review}, - author={Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, - journal={arXiv preprint arXiv:2103.06268}, - year={2021} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{hendrycks2021cuad, + author = {Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, + journal = {arXiv preprint arXiv:2103.06268}, + title = {Cuad: An expert-annotated nlp dataset for legal contract review}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -2223,22 +2279,23 @@ class CUADPriceRestrictionsLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{hendrycks2021cuad, - title={Cuad: An expert-annotated nlp dataset for legal contract review}, - author={Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, - journal={arXiv preprint arXiv:2103.06268}, - year={2021} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{hendrycks2021cuad, + author = {Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, + journal = {arXiv preprint arXiv:2103.06268}, + title = {Cuad: An expert-annotated nlp dataset for legal contract review}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -2274,22 +2331,23 @@ class CUADRenewalTermLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{hendrycks2021cuad, - title={Cuad: An expert-annotated nlp dataset for legal contract review}, - author={Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, - journal={arXiv preprint arXiv:2103.06268}, - year={2021} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{hendrycks2021cuad, + author = {Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, + journal = {arXiv preprint arXiv:2103.06268}, + title = {Cuad: An expert-annotated nlp dataset for legal contract review}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -2325,22 +2383,23 @@ class CUADRevenueProfitSharingLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{hendrycks2021cuad, - title={Cuad: An expert-annotated nlp dataset for legal contract review}, - author={Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, - journal={arXiv preprint arXiv:2103.06268}, - year={2021} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{hendrycks2021cuad, + author = {Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, + journal = {arXiv preprint arXiv:2103.06268}, + title = {Cuad: An expert-annotated nlp dataset for legal contract review}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -2376,22 +2435,23 @@ class CUADRofrRofoRofnLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{hendrycks2021cuad, - title={Cuad: An expert-annotated nlp dataset for legal contract review}, - author={Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, - journal={arXiv preprint arXiv:2103.06268}, - year={2021} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{hendrycks2021cuad, + author = {Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, + journal = {arXiv preprint arXiv:2103.06268}, + title = {Cuad: An expert-annotated nlp dataset for legal contract review}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -2427,22 +2487,23 @@ class CUADSourceCodeEscrowLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{hendrycks2021cuad, - title={Cuad: An expert-annotated nlp dataset for legal contract review}, - author={Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, - journal={arXiv preprint arXiv:2103.06268}, - year={2021} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{hendrycks2021cuad, + author = {Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, + journal = {arXiv preprint arXiv:2103.06268}, + title = {Cuad: An expert-annotated nlp dataset for legal contract review}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -2478,22 +2539,23 @@ class CUADTerminationForConvenienceLegalBenchClassification(AbsTaskClassificatio annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{hendrycks2021cuad, - title={Cuad: An expert-annotated nlp dataset for legal contract review}, - author={Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, - journal={arXiv preprint arXiv:2103.06268}, - year={2021} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{hendrycks2021cuad, + author = {Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, + journal = {arXiv preprint arXiv:2103.06268}, + title = {Cuad: An expert-annotated nlp dataset for legal contract review}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -2529,22 +2591,23 @@ class CUADThirdPartyBeneficiaryLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{hendrycks2021cuad, - title={Cuad: An expert-annotated nlp dataset for legal contract review}, - author={Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, - journal={arXiv preprint arXiv:2103.06268}, - year={2021} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{hendrycks2021cuad, + author = {Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, + journal = {arXiv preprint arXiv:2103.06268}, + title = {Cuad: An expert-annotated nlp dataset for legal contract review}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -2580,22 +2643,23 @@ class CUADUncappedLiabilityLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{hendrycks2021cuad, - title={Cuad: An expert-annotated nlp dataset for legal contract review}, - author={Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, - journal={arXiv preprint arXiv:2103.06268}, - year={2021} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{hendrycks2021cuad, + author = {Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, + journal = {arXiv preprint arXiv:2103.06268}, + title = {Cuad: An expert-annotated nlp dataset for legal contract review}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -2631,22 +2695,23 @@ class CUADUnlimitedAllYouCanEatLicenseLegalBenchClassification(AbsTaskClassifica annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{hendrycks2021cuad, - title={Cuad: An expert-annotated nlp dataset for legal contract review}, - author={Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, - journal={arXiv preprint arXiv:2103.06268}, - year={2021} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{hendrycks2021cuad, + author = {Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, + journal = {arXiv preprint arXiv:2103.06268}, + title = {Cuad: An expert-annotated nlp dataset for legal contract review}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -2682,22 +2747,23 @@ class CUADVolumeRestrictionLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{hendrycks2021cuad, - title={Cuad: An expert-annotated nlp dataset for legal contract review}, - author={Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, - journal={arXiv preprint arXiv:2103.06268}, - year={2021} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{hendrycks2021cuad, + author = {Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, + journal = {arXiv preprint arXiv:2103.06268}, + title = {Cuad: An expert-annotated nlp dataset for legal contract review}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -2733,22 +2799,23 @@ class CUADWarrantyDurationLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{hendrycks2021cuad, - title={Cuad: An expert-annotated nlp dataset for legal contract review}, - author={Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, - journal={arXiv preprint arXiv:2103.06268}, - year={2021} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{hendrycks2021cuad, + author = {Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, + journal = {arXiv preprint arXiv:2103.06268}, + title = {Cuad: An expert-annotated nlp dataset for legal contract review}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -2784,16 +2851,16 @@ class DefinitionClassificationLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} +""", ) def dataset_transform(self): @@ -2829,16 +2896,16 @@ class Diversity1LegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. 
Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} +""", ) def dataset_transform(self): @@ -2898,16 +2965,16 @@ class Diversity2LegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. 
Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} +""", ) def dataset_transform(self): @@ -2967,16 +3034,16 @@ class Diversity3LegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} +""", ) def dataset_transform(self): @@ -3036,16 +3103,16 @@ class Diversity4LegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. 
Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} +""", ) def dataset_transform(self): @@ -3105,16 +3172,16 @@ class Diversity5LegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. 
Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} +""", ) def dataset_transform(self): @@ -3174,16 +3241,16 @@ class Diversity6LegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} +""", ) def dataset_transform(self): @@ -3251,16 +3318,16 @@ class FunctionOfDecisionSectionLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. 
Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} +""", ) def dataset_transform(self): @@ -3299,16 +3366,16 @@ class InsurancePolicyInterpretationLegalBenchClassification(AbsTaskClassificatio annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. 
Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} +""", ) def dataset_transform(self): @@ -3344,23 +3411,24 @@ class InternationalCitizenshipQuestionsLegalBenchClassification(AbsTaskClassific annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @misc{vink2023globalcit, - author = {Vink, Maarten and van der Baaren, Luuk and Bauböck, Rainer and Džankić, Jelena and Honohan, Iseult and Manby, Bronwen}, - title = {GLOBALCIT Citizenship Law Dataset, v2.0, Country-Year-Mode Data (Acquisition)}, - howpublished = {https://hdl.handle.net/1814/73190}, - year = {2023}, - publisher = {Global Citizenship Observatory} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@misc{vink2023globalcit, + author = {Vink, Maarten and van der Baaren, Luuk and Bauböck, Rainer and Džankić, Jelena and Honohan, Iseult and Manby, Bronwen}, + howpublished = {https://hdl.handle.net/1814/73190}, + publisher = {Global Citizenship Observatory}, + title = {GLOBALCIT Citizenship Law Dataset, v2.0, Country-Year-Mode Data (Acquisition)}, + year = {2023}, +} +""", ) def dataset_transform(self): @@ -3401,16 +3469,16 @@ class JCrewBlockerLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} +""", ) def dataset_transform(self): @@ -3446,24 +3514,25 @@ class LearnedHandsBenefitsLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. 
Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @dataset{learned_hands, - title = {LearnedHands Dataset}, - author = {{Suffolk University Law School} and {Stanford Legal Design Lab}}, - year = {2022}, - url = {https://spot.suffolklitlab.org/data/#learnedhands}, - note = {The LearnedHands dataset is licensed under CC BY-NC-SA 4.0}, - urldate = {2022-05-21} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@dataset{learned_hands, + author = {{Suffolk University Law School} and {Stanford Legal Design Lab}}, + note = {The LearnedHands dataset is licensed under CC BY-NC-SA 4.0}, + title = {LearnedHands Dataset}, + url = {https://spot.suffolklitlab.org/data/#learnedhands}, + urldate = {2022-05-21}, + year = {2022}, +} +""", ) def dataset_transform(self): @@ -3499,24 +3568,25 @@ class LearnedHandsBusinessLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @dataset{learned_hands, - title = {LearnedHands Dataset}, - author = {{Suffolk University Law School} and {Stanford Legal Design Lab}}, - year = {2022}, - url = {https://spot.suffolklitlab.org/data/#learnedhands}, - note = {The LearnedHands dataset is licensed under CC BY-NC-SA 4.0}, - urldate = {2022-05-21} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@dataset{learned_hands, + author = {{Suffolk University Law School} and {Stanford Legal Design Lab}}, + note = {The LearnedHands dataset is licensed under CC BY-NC-SA 4.0}, + title = {LearnedHands Dataset}, + url = {https://spot.suffolklitlab.org/data/#learnedhands}, + urldate = {2022-05-21}, + year = {2022}, +} +""", ) def dataset_transform(self): @@ -3552,24 +3622,25 @@ class LearnedHandsConsumerLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @dataset{learned_hands, - title = {LearnedHands Dataset}, - author = {{Suffolk University Law School} and {Stanford Legal Design Lab}}, - year = {2022}, - url = {https://spot.suffolklitlab.org/data/#learnedhands}, - note = {The LearnedHands dataset is licensed under CC BY-NC-SA 4.0}, - urldate = {2022-05-21} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@dataset{learned_hands, + author = {{Suffolk University Law School} and {Stanford Legal Design Lab}}, + note = {The LearnedHands dataset is licensed under CC BY-NC-SA 4.0}, + title = {LearnedHands Dataset}, + url = {https://spot.suffolklitlab.org/data/#learnedhands}, + urldate = {2022-05-21}, + year = {2022}, +} +""", ) def dataset_transform(self): @@ -3605,24 +3676,25 @@ class LearnedHandsCourtsLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @dataset{learned_hands, - title = {LearnedHands Dataset}, - author = {{Suffolk University Law School} and {Stanford Legal Design Lab}}, - year = {2022}, - url = {https://spot.suffolklitlab.org/data/#learnedhands}, - note = {The LearnedHands dataset is licensed under CC BY-NC-SA 4.0}, - urldate = {2022-05-21} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@dataset{learned_hands, + author = {{Suffolk University Law School} and {Stanford Legal Design Lab}}, + note = {The LearnedHands dataset is licensed under CC BY-NC-SA 4.0}, + title = {LearnedHands Dataset}, + url = {https://spot.suffolklitlab.org/data/#learnedhands}, + urldate = {2022-05-21}, + year = {2022}, +} +""", ) def dataset_transform(self): @@ -3658,24 +3730,25 @@ class LearnedHandsCrimeLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @dataset{learned_hands, - title = {LearnedHands Dataset}, - author = {{Suffolk University Law School} and {Stanford Legal Design Lab}}, - year = {2022}, - url = {https://spot.suffolklitlab.org/data/#learnedhands}, - note = {The LearnedHands dataset is licensed under CC BY-NC-SA 4.0}, - urldate = {2022-05-21} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@dataset{learned_hands, + author = {{Suffolk University Law School} and {Stanford Legal Design Lab}}, + note = {The LearnedHands dataset is licensed under CC BY-NC-SA 4.0}, + title = {LearnedHands Dataset}, + url = {https://spot.suffolklitlab.org/data/#learnedhands}, + urldate = {2022-05-21}, + year = {2022}, +} +""", ) def dataset_transform(self): @@ -3711,24 +3784,25 @@ class LearnedHandsDivorceLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @dataset{learned_hands, - title = {LearnedHands Dataset}, - author = {{Suffolk University Law School} and {Stanford Legal Design Lab}}, - year = {2022}, - url = {https://spot.suffolklitlab.org/data/#learnedhands}, - note = {The LearnedHands dataset is licensed under CC BY-NC-SA 4.0}, - urldate = {2022-05-21} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@dataset{learned_hands, + author = {{Suffolk University Law School} and {Stanford Legal Design Lab}}, + note = {The LearnedHands dataset is licensed under CC BY-NC-SA 4.0}, + title = {LearnedHands Dataset}, + url = {https://spot.suffolklitlab.org/data/#learnedhands}, + urldate = {2022-05-21}, + year = {2022}, +} +""", ) def dataset_transform(self): @@ -3764,24 +3838,25 @@ class LearnedHandsDomesticViolenceLegalBenchClassification(AbsTaskClassification annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @dataset{learned_hands, - title = {LearnedHands Dataset}, - author = {{Suffolk University Law School} and {Stanford Legal Design Lab}}, - year = {2022}, - url = {https://spot.suffolklitlab.org/data/#learnedhands}, - note = {The LearnedHands dataset is licensed under CC BY-NC-SA 4.0}, - urldate = {2022-05-21} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@dataset{learned_hands, + author = {{Suffolk University Law School} and {Stanford Legal Design Lab}}, + note = {The LearnedHands dataset is licensed under CC BY-NC-SA 4.0}, + title = {LearnedHands Dataset}, + url = {https://spot.suffolklitlab.org/data/#learnedhands}, + urldate = {2022-05-21}, + year = {2022}, +} +""", ) def dataset_transform(self): @@ -3817,24 +3892,25 @@ class LearnedHandsEducationLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @dataset{learned_hands, - title = {LearnedHands Dataset}, - author = {{Suffolk University Law School} and {Stanford Legal Design Lab}}, - year = {2022}, - url = {https://spot.suffolklitlab.org/data/#learnedhands}, - note = {The LearnedHands dataset is licensed under CC BY-NC-SA 4.0}, - urldate = {2022-05-21} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@dataset{learned_hands, + author = {{Suffolk University Law School} and {Stanford Legal Design Lab}}, + note = {The LearnedHands dataset is licensed under CC BY-NC-SA 4.0}, + title = {LearnedHands Dataset}, + url = {https://spot.suffolklitlab.org/data/#learnedhands}, + urldate = {2022-05-21}, + year = {2022}, +} +""", ) def dataset_transform(self): @@ -3870,24 +3946,25 @@ class LearnedHandsEmploymentLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @dataset{learned_hands, - title = {LearnedHands Dataset}, - author = {{Suffolk University Law School} and {Stanford Legal Design Lab}}, - year = {2022}, - url = {https://spot.suffolklitlab.org/data/#learnedhands}, - note = {The LearnedHands dataset is licensed under CC BY-NC-SA 4.0}, - urldate = {2022-05-21} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@dataset{learned_hands, + author = {{Suffolk University Law School} and {Stanford Legal Design Lab}}, + note = {The LearnedHands dataset is licensed under CC BY-NC-SA 4.0}, + title = {LearnedHands Dataset}, + url = {https://spot.suffolklitlab.org/data/#learnedhands}, + urldate = {2022-05-21}, + year = {2022}, +} +""", ) def dataset_transform(self): @@ -3923,24 +4000,25 @@ class LearnedHandsEstatesLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @dataset{learned_hands, - title = {LearnedHands Dataset}, - author = {{Suffolk University Law School} and {Stanford Legal Design Lab}}, - year = {2022}, - url = {https://spot.suffolklitlab.org/data/#learnedhands}, - note = {The LearnedHands dataset is licensed under CC BY-NC-SA 4.0}, - urldate = {2022-05-21} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@dataset{learned_hands, + author = {{Suffolk University Law School} and {Stanford Legal Design Lab}}, + note = {The LearnedHands dataset is licensed under CC BY-NC-SA 4.0}, + title = {LearnedHands Dataset}, + url = {https://spot.suffolklitlab.org/data/#learnedhands}, + urldate = {2022-05-21}, + year = {2022}, +} +""", ) def dataset_transform(self): @@ -3976,24 +4054,25 @@ class LearnedHandsFamilyLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @dataset{learned_hands, - title = {LearnedHands Dataset}, - author = {{Suffolk University Law School} and {Stanford Legal Design Lab}}, - year = {2022}, - url = {https://spot.suffolklitlab.org/data/#learnedhands}, - note = {The LearnedHands dataset is licensed under CC BY-NC-SA 4.0}, - urldate = {2022-05-21} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@dataset{learned_hands, + author = {{Suffolk University Law School} and {Stanford Legal Design Lab}}, + note = {The LearnedHands dataset is licensed under CC BY-NC-SA 4.0}, + title = {LearnedHands Dataset}, + url = {https://spot.suffolklitlab.org/data/#learnedhands}, + urldate = {2022-05-21}, + year = {2022}, +} +""", ) def dataset_transform(self): @@ -4032,24 +4111,25 @@ class LearnedHandsHealthLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @dataset{learned_hands, - title = {LearnedHands Dataset}, - author = {{Suffolk University Law School} and {Stanford Legal Design Lab}}, - year = {2022}, - url = {https://spot.suffolklitlab.org/data/#learnedhands}, - note = {The LearnedHands dataset is licensed under CC BY-NC-SA 4.0}, - urldate = {2022-05-21} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@dataset{learned_hands, + author = {{Suffolk University Law School} and {Stanford Legal Design Lab}}, + note = {The LearnedHands dataset is licensed under CC BY-NC-SA 4.0}, + title = {LearnedHands Dataset}, + url = {https://spot.suffolklitlab.org/data/#learnedhands}, + urldate = {2022-05-21}, + year = {2022}, +} +""", ) def dataset_transform(self): @@ -4085,24 +4165,25 @@ class LearnedHandsHousingLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @dataset{learned_hands, - title = {LearnedHands Dataset}, - author = {{Suffolk University Law School} and {Stanford Legal Design Lab}}, - year = {2022}, - url = {https://spot.suffolklitlab.org/data/#learnedhands}, - note = {The LearnedHands dataset is licensed under CC BY-NC-SA 4.0}, - urldate = {2022-05-21} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@dataset{learned_hands, + author = {{Suffolk University Law School} and {Stanford Legal Design Lab}}, + note = {The LearnedHands dataset is licensed under CC BY-NC-SA 4.0}, + title = {LearnedHands Dataset}, + url = {https://spot.suffolklitlab.org/data/#learnedhands}, + urldate = {2022-05-21}, + year = {2022}, +} +""", ) def dataset_transform(self): @@ -4141,24 +4222,25 @@ class LearnedHandsImmigrationLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @dataset{learned_hands, - title = {LearnedHands Dataset}, - author = {{Suffolk University Law School} and {Stanford Legal Design Lab}}, - year = {2022}, - url = {https://spot.suffolklitlab.org/data/#learnedhands}, - note = {The LearnedHands dataset is licensed under CC BY-NC-SA 4.0}, - urldate = {2022-05-21} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@dataset{learned_hands, + author = {{Suffolk University Law School} and {Stanford Legal Design Lab}}, + note = {The LearnedHands dataset is licensed under CC BY-NC-SA 4.0}, + title = {LearnedHands Dataset}, + url = {https://spot.suffolklitlab.org/data/#learnedhands}, + urldate = {2022-05-21}, + year = {2022}, +} +""", ) def dataset_transform(self): @@ -4194,24 +4276,25 @@ class LearnedHandsTortsLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @dataset{learned_hands, - title = {LearnedHands Dataset}, - author = {{Suffolk University Law School} and {Stanford Legal Design Lab}}, - year = {2022}, - url = {https://spot.suffolklitlab.org/data/#learnedhands}, - note = {The LearnedHands dataset is licensed under CC BY-NC-SA 4.0}, - urldate = {2022-05-21} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@dataset{learned_hands, + author = {{Suffolk University Law School} and {Stanford Legal Design Lab}}, + note = {The LearnedHands dataset is licensed under CC BY-NC-SA 4.0}, + title = {LearnedHands Dataset}, + url = {https://spot.suffolklitlab.org/data/#learnedhands}, + urldate = {2022-05-21}, + year = {2022}, +} +""", ) def dataset_transform(self): @@ -4247,24 +4330,25 @@ class LearnedHandsTrafficLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @dataset{learned_hands, - title = {LearnedHands Dataset}, - author = {{Suffolk University Law School} and {Stanford Legal Design Lab}}, - year = {2022}, - url = {https://spot.suffolklitlab.org/data/#learnedhands}, - note = {The LearnedHands dataset is licensed under CC BY-NC-SA 4.0}, - urldate = {2022-05-21} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@dataset{learned_hands, + author = {{Suffolk University Law School} and {Stanford Legal Design Lab}}, + note = {The LearnedHands dataset is licensed under CC BY-NC-SA 4.0}, + title = {LearnedHands Dataset}, + url = {https://spot.suffolklitlab.org/data/#learnedhands}, + urldate = {2022-05-21}, + year = {2022}, +} +""", ) def dataset_transform(self): @@ -4300,16 +4384,16 @@ class LegalReasoningCausalityLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} +""", ) def dataset_transform(self): @@ -4523,22 +4607,23 @@ class MAUDLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. 
Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - } - @article{wang2023maud, - title={MAUD: An Expert-Annotated Legal NLP Dataset for Merger Agreement Understanding}, - author={Wang, Steven H and Scardigli, Antoine and Tang, Leonard and Chen, Wei and Levkin, Dimitry and Chen, Anya and Ball, Spencer and Woodside, Thomas and Zhang, Oliver and Hendrycks, Dan}, - journal={arXiv preprint arXiv:2301.00876}, - year={2023} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{wang2023maud, + author = {Wang, Steven H and Scardigli, Antoine and Tang, Leonard and Chen, Wei and Levkin, Dimitry and Chen, Anya and Ball, Spencer and Woodside, Thomas and Zhang, Oliver and Hendrycks, Dan}, + journal = {arXiv preprint arXiv:2301.00876}, + title = {MAUD: An Expert-Annotated Legal NLP Dataset for Merger Agreement Understanding}, + year = {2023}, +} +""", ) def load_data(self, **kwargs: Any) -> None: @@ -4630,16 +4715,16 @@ class NYSJudicialEthicsLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} +""", ) def dataset_transform(self): @@ -4677,23 +4762,24 @@ class OPP115DataRetentionLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. 
Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - } - @inproceedings{wilson2016creation, - title={The creation and analysis of a website privacy policy corpus}, - author={Wilson, Shomir and Schaub, Florian and Dara, Aswarth Abhilash and Liu, Frederick and Cherivirala, Sushain and Leon, Pedro Giovanni and Andersen, Mads Schaarup and Zimmeck, Sebastian and Sathyendra, Kanthashree Mysore and Russell, N Cameron and others}, - booktitle={Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)}, - pages={1330--1340}, - year={2016} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@inproceedings{wilson2016creation, + author = {Wilson, Shomir and Schaub, Florian and Dara, Aswarth Abhilash and Liu, Frederick and Cherivirala, Sushain and Leon, Pedro Giovanni and Andersen, Mads Schaarup and Zimmeck, Sebastian and Sathyendra, Kanthashree Mysore and Russell, N Cameron and others}, + booktitle = {Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)}, + pages = {1330--1340}, + title = {The creation and analysis of a website privacy policy corpus}, + year = {2016}, +} +""", ) def dataset_transform(self): @@ -4729,23 +4815,24 @@ class OPP115DataSecurityLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - } - @inproceedings{wilson2016creation, - title={The creation and analysis of a website privacy policy corpus}, - author={Wilson, Shomir and Schaub, Florian and Dara, Aswarth Abhilash and Liu, Frederick and Cherivirala, Sushain and Leon, Pedro Giovanni and Andersen, Mads Schaarup and Zimmeck, Sebastian and Sathyendra, Kanthashree Mysore and Russell, N Cameron and others}, - booktitle={Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)}, - pages={1330--1340}, - year={2016} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@inproceedings{wilson2016creation, + author = {Wilson, Shomir and Schaub, Florian and Dara, Aswarth Abhilash and Liu, Frederick and Cherivirala, Sushain and Leon, Pedro Giovanni and Andersen, Mads Schaarup and Zimmeck, Sebastian and Sathyendra, Kanthashree Mysore and Russell, N Cameron and others}, + booktitle = {Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)}, + pages = {1330--1340}, + title = {The creation and analysis of a website privacy policy corpus}, + year = {2016}, +} +""", ) def dataset_transform(self): @@ -4781,23 +4868,24 @@ class OPP115DoNotTrackLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - } - @inproceedings{wilson2016creation, - title={The creation and analysis of a website privacy policy corpus}, - author={Wilson, Shomir and Schaub, Florian and Dara, Aswarth Abhilash and Liu, Frederick and Cherivirala, Sushain and Leon, Pedro Giovanni and Andersen, Mads Schaarup and Zimmeck, Sebastian and Sathyendra, Kanthashree Mysore and Russell, N Cameron and others}, - booktitle={Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)}, - pages={1330--1340}, - year={2016} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@inproceedings{wilson2016creation, + author = {Wilson, Shomir and Schaub, Florian and Dara, Aswarth Abhilash and Liu, Frederick and Cherivirala, Sushain and Leon, Pedro Giovanni and Andersen, Mads Schaarup and Zimmeck, Sebastian and Sathyendra, Kanthashree Mysore and Russell, N Cameron and others}, + booktitle = {Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)}, + pages = {1330--1340}, + title = {The creation and analysis of a website privacy policy corpus}, + year = {2016}, +} +""", ) def dataset_transform(self): @@ -4833,23 +4921,24 @@ class OPP115FirstPartyCollectionUseLegalBenchClassification(AbsTaskClassificatio annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - } - @inproceedings{wilson2016creation, - title={The creation and analysis of a website privacy policy corpus}, - author={Wilson, Shomir and Schaub, Florian and Dara, Aswarth Abhilash and Liu, Frederick and Cherivirala, Sushain and Leon, Pedro Giovanni and Andersen, Mads Schaarup and Zimmeck, Sebastian and Sathyendra, Kanthashree Mysore and Russell, N Cameron and others}, - booktitle={Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)}, - pages={1330--1340}, - year={2016} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@inproceedings{wilson2016creation, + author = {Wilson, Shomir and Schaub, Florian and Dara, Aswarth Abhilash and Liu, Frederick and Cherivirala, Sushain and Leon, Pedro Giovanni and Andersen, Mads Schaarup and Zimmeck, Sebastian and Sathyendra, Kanthashree Mysore and Russell, N Cameron and others}, + booktitle = {Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)}, + pages = {1330--1340}, + title = {The creation and analysis of a website privacy policy corpus}, + year = {2016}, +} +""", ) def dataset_transform(self): @@ -4887,23 +4976,24 @@ class OPP115InternationalAndSpecificAudiencesLegalBenchClassification( annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - } - @inproceedings{wilson2016creation, - title={The creation and analysis of a website privacy policy corpus}, - author={Wilson, Shomir and Schaub, Florian and Dara, Aswarth Abhilash and Liu, Frederick and Cherivirala, Sushain and Leon, Pedro Giovanni and Andersen, Mads Schaarup and Zimmeck, Sebastian and Sathyendra, Kanthashree Mysore and Russell, N Cameron and others}, - booktitle={Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)}, - pages={1330--1340}, - year={2016} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@inproceedings{wilson2016creation, + author = {Wilson, Shomir and Schaub, Florian and Dara, Aswarth Abhilash and Liu, Frederick and Cherivirala, Sushain and Leon, Pedro Giovanni and Andersen, Mads Schaarup and Zimmeck, Sebastian and Sathyendra, Kanthashree Mysore and Russell, N Cameron and others}, + booktitle = {Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)}, + pages = {1330--1340}, + title = {The creation and analysis of a website privacy policy corpus}, + year = {2016}, +} +""", ) def dataset_transform(self): @@ -4939,23 +5029,24 @@ class OPP115PolicyChangeLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - } - @inproceedings{wilson2016creation, - title={The creation and analysis of a website privacy policy corpus}, - author={Wilson, Shomir and Schaub, Florian and Dara, Aswarth Abhilash and Liu, Frederick and Cherivirala, Sushain and Leon, Pedro Giovanni and Andersen, Mads Schaarup and Zimmeck, Sebastian and Sathyendra, Kanthashree Mysore and Russell, N Cameron and others}, - booktitle={Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)}, - pages={1330--1340}, - year={2016} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@inproceedings{wilson2016creation, + author = {Wilson, Shomir and Schaub, Florian and Dara, Aswarth Abhilash and Liu, Frederick and Cherivirala, Sushain and Leon, Pedro Giovanni and Andersen, Mads Schaarup and Zimmeck, Sebastian and Sathyendra, Kanthashree Mysore and Russell, N Cameron and others}, + booktitle = {Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)}, + pages = {1330--1340}, + title = {The creation and analysis of a website privacy policy corpus}, + year = {2016}, +} +""", ) def dataset_transform(self): @@ -4991,23 +5082,24 @@ class OPP115ThirdPartySharingCollectionLegalBenchClassification(AbsTaskClassific annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - } - @inproceedings{wilson2016creation, - title={The creation and analysis of a website privacy policy corpus}, - author={Wilson, Shomir and Schaub, Florian and Dara, Aswarth Abhilash and Liu, Frederick and Cherivirala, Sushain and Leon, Pedro Giovanni and Andersen, Mads Schaarup and Zimmeck, Sebastian and Sathyendra, Kanthashree Mysore and Russell, N Cameron and others}, - booktitle={Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)}, - pages={1330--1340}, - year={2016} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@inproceedings{wilson2016creation, + author = {Wilson, Shomir and Schaub, Florian and Dara, Aswarth Abhilash and Liu, Frederick and Cherivirala, Sushain and Leon, Pedro Giovanni and Andersen, Mads Schaarup and Zimmeck, Sebastian and Sathyendra, Kanthashree Mysore and Russell, N Cameron and others}, + booktitle = {Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)}, + pages = {1330--1340}, + title = {The creation and analysis of a website privacy policy corpus}, + year = {2016}, +} +""", ) def dataset_transform(self): @@ -5043,23 +5135,24 @@ class OPP115UserAccessEditAndDeletionLegalBenchClassification(AbsTaskClassificat annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - } - @inproceedings{wilson2016creation, - title={The creation and analysis of a website privacy policy corpus}, - author={Wilson, Shomir and Schaub, Florian and Dara, Aswarth Abhilash and Liu, Frederick and Cherivirala, Sushain and Leon, Pedro Giovanni and Andersen, Mads Schaarup and Zimmeck, Sebastian and Sathyendra, Kanthashree Mysore and Russell, N Cameron and others}, - booktitle={Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)}, - pages={1330--1340}, - year={2016} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@inproceedings{wilson2016creation, + author = {Wilson, Shomir and Schaub, Florian and Dara, Aswarth Abhilash and Liu, Frederick and Cherivirala, Sushain and Leon, Pedro Giovanni and Andersen, Mads Schaarup and Zimmeck, Sebastian and Sathyendra, Kanthashree Mysore and Russell, N Cameron and others}, + booktitle = {Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)}, + pages = {1330--1340}, + title = {The creation and analysis of a website privacy policy corpus}, + year = {2016}, +} +""", ) def dataset_transform(self): @@ -5095,23 +5188,24 @@ class OPP115UserChoiceControlLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - } - @inproceedings{wilson2016creation, - title={The creation and analysis of a website privacy policy corpus}, - author={Wilson, Shomir and Schaub, Florian and Dara, Aswarth Abhilash and Liu, Frederick and Cherivirala, Sushain and Leon, Pedro Giovanni and Andersen, Mads Schaarup and Zimmeck, Sebastian and Sathyendra, Kanthashree Mysore and Russell, N Cameron and others}, - booktitle={Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)}, - pages={1330--1340}, - year={2016} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@inproceedings{wilson2016creation, + author = {Wilson, Shomir and Schaub, Florian and Dara, Aswarth Abhilash and Liu, Frederick and Cherivirala, Sushain and Leon, Pedro Giovanni and Andersen, Mads Schaarup and Zimmeck, Sebastian and Sathyendra, Kanthashree Mysore and Russell, N Cameron and others}, + booktitle = {Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)}, + pages = {1330--1340}, + title = {The creation and analysis of a website privacy policy corpus}, + year = {2016}, +} +""", ) def dataset_transform(self): @@ -5155,16 +5249,16 @@ class OralArgumentQuestionPurposeLegalBenchClassification(AbsTaskClassification) annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} +""", ) def dataset_transform(self): @@ -5196,23 +5290,24 @@ class OverrulingLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. 
Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @inproceedings{zheng2021does, - title={When does pretraining help? assessing self-supervised learning for law and the casehold dataset of 53,000+ legal holdings}, - author={Zheng, Lucia and Guha, Neel and Anderson, Brandon R and Henderson, Peter and Ho, Daniel E}, - booktitle={Proceedings of the eighteenth international conference on artificial intelligence and law}, - pages={159--168}, - year={2021} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@inproceedings{zheng2021does, + author = {Zheng, Lucia and Guha, Neel and Anderson, Brandon R and Henderson, Peter and Ho, Daniel E}, + booktitle = {Proceedings of the eighteenth international conference on artificial intelligence and law}, + pages = {159--168}, + title = {When does pretraining help? assessing self-supervised learning for law and the casehold dataset of 53,000+ legal holdings}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -5251,16 +5346,16 @@ class PersonalJurisdictionLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} +""", ) def dataset_transform(self): @@ -5296,16 +5391,16 @@ class PROALegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. 
Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} +""", ) def dataset_transform(self): @@ -5341,25 +5436,26 @@ class SCDBPAccountabilityLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{chilton2017limitations, - title={The limitations of supply chain disclosure regimes}, - author={Chilton, Adam S and Sarfaty, Galit A}, - journal={Stan. J. 
Int'l L.}, - volume={53}, - pages={1}, - year={2017}, - publisher={HeinOnline} - } - """, + bibtex_citation=r""" +@article{chilton2017limitations, + author = {Chilton, Adam S and Sarfaty, Galit A}, + journal = {Stan. J. Int'l L.}, + pages = {1}, + publisher = {HeinOnline}, + title = {The limitations of supply chain disclosure regimes}, + volume = {53}, + year = {2017}, +} + +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} +""", ) def dataset_transform(self): @@ -5395,25 +5491,26 @@ class SCDBPAuditsLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. 
Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{chilton2017limitations, - title={The limitations of supply chain disclosure regimes}, - author={Chilton, Adam S and Sarfaty, Galit A}, - journal={Stan. J. Int'l L.}, - volume={53}, - pages={1}, - year={2017}, - publisher={HeinOnline} - } - """, + bibtex_citation=r""" +@article{chilton2017limitations, + author = {Chilton, Adam S and Sarfaty, Galit A}, + journal = {Stan. J. Int'l L.}, + pages = {1}, + publisher = {HeinOnline}, + title = {The limitations of supply chain disclosure regimes}, + volume = {53}, + year = {2017}, +} + +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} +""", ) def dataset_transform(self): @@ -5449,25 +5546,26 @@ class SCDBPCertificationLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{chilton2017limitations, - title={The limitations of supply chain disclosure regimes}, - author={Chilton, Adam S and Sarfaty, Galit A}, - journal={Stan. J. 
Int'l L.}, - volume={53}, - pages={1}, - year={2017}, - publisher={HeinOnline} - } - """, + bibtex_citation=r""" +@article{chilton2017limitations, + author = {Chilton, Adam S and Sarfaty, Galit A}, + journal = {Stan. J. Int'l L.}, + pages = {1}, + publisher = {HeinOnline}, + title = {The limitations of supply chain disclosure regimes}, + volume = {53}, + year = {2017}, +} + +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} +""", ) def dataset_transform(self): @@ -5503,25 +5601,26 @@ class SCDBPTrainingLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. 
Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{chilton2017limitations, - title={The limitations of supply chain disclosure regimes}, - author={Chilton, Adam S and Sarfaty, Galit A}, - journal={Stan. J. Int'l L.}, - volume={53}, - pages={1}, - year={2017}, - publisher={HeinOnline} - } - """, + bibtex_citation=r""" +@article{chilton2017limitations, + author = {Chilton, Adam S and Sarfaty, Galit A}, + journal = {Stan. J. Int'l L.}, + pages = {1}, + publisher = {HeinOnline}, + title = {The limitations of supply chain disclosure regimes}, + volume = {53}, + year = {2017}, +} + +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} +""", ) def dataset_transform(self): @@ -5557,25 +5656,26 @@ class SCDBPVerificationLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{chilton2017limitations, - title={The limitations of supply chain disclosure regimes}, - author={Chilton, Adam S and Sarfaty, Galit A}, - journal={Stan. J. 
Int'l L.}, - volume={53}, - pages={1}, - year={2017}, - publisher={HeinOnline} - } - """, + bibtex_citation=r""" +@article{chilton2017limitations, + author = {Chilton, Adam S and Sarfaty, Galit A}, + journal = {Stan. J. Int'l L.}, + pages = {1}, + publisher = {HeinOnline}, + title = {The limitations of supply chain disclosure regimes}, + volume = {53}, + year = {2017}, +} + +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} +""", ) def dataset_transform(self): @@ -5611,25 +5711,26 @@ class SCDDAccountabilityLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. 
Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{chilton2017limitations, - title={The limitations of supply chain disclosure regimes}, - author={Chilton, Adam S and Sarfaty, Galit A}, - journal={Stan. J. Int'l L.}, - volume={53}, - pages={1}, - year={2017}, - publisher={HeinOnline} - } - """, + bibtex_citation=r""" +@article{chilton2017limitations, + author = {Chilton, Adam S and Sarfaty, Galit A}, + journal = {Stan. J. Int'l L.}, + pages = {1}, + publisher = {HeinOnline}, + title = {The limitations of supply chain disclosure regimes}, + volume = {53}, + year = {2017}, +} + +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} +""", ) def dataset_transform(self): @@ -5665,25 +5766,26 @@ class SCDDAuditsLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{chilton2017limitations, - title={The limitations of supply chain disclosure regimes}, - author={Chilton, Adam S and Sarfaty, Galit A}, - journal={Stan. J. 
Int'l L.}, - volume={53}, - pages={1}, - year={2017}, - publisher={HeinOnline} - } - """, + bibtex_citation=r""" +@article{chilton2017limitations, + author = {Chilton, Adam S and Sarfaty, Galit A}, + journal = {Stan. J. Int'l L.}, + pages = {1}, + publisher = {HeinOnline}, + title = {The limitations of supply chain disclosure regimes}, + volume = {53}, + year = {2017}, +} + +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} +""", ) def dataset_transform(self): @@ -5719,25 +5821,26 @@ class SCDDCertificationLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. 
Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{chilton2017limitations, - title={The limitations of supply chain disclosure regimes}, - author={Chilton, Adam S and Sarfaty, Galit A}, - journal={Stan. J. Int'l L.}, - volume={53}, - pages={1}, - year={2017}, - publisher={HeinOnline} - } - """, + bibtex_citation=r""" +@article{chilton2017limitations, + author = {Chilton, Adam S and Sarfaty, Galit A}, + journal = {Stan. J. Int'l L.}, + pages = {1}, + publisher = {HeinOnline}, + title = {The limitations of supply chain disclosure regimes}, + volume = {53}, + year = {2017}, +} + +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} +""", ) def dataset_transform(self): @@ -5773,25 +5876,26 @@ class SCDDTrainingLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{chilton2017limitations, - title={The limitations of supply chain disclosure regimes}, - author={Chilton, Adam S and Sarfaty, Galit A}, - journal={Stan. J. 
Int'l L.}, - volume={53}, - pages={1}, - year={2017}, - publisher={HeinOnline} - } - """, + bibtex_citation=r""" +@article{chilton2017limitations, + author = {Chilton, Adam S and Sarfaty, Galit A}, + journal = {Stan. J. Int'l L.}, + pages = {1}, + publisher = {HeinOnline}, + title = {The limitations of supply chain disclosure regimes}, + volume = {53}, + year = {2017}, +} + +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} +""", ) def dataset_transform(self): @@ -5827,25 +5931,26 @@ class SCDDVerificationLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. 
Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }, - @article{chilton2017limitations, - title={The limitations of supply chain disclosure regimes}, - author={Chilton, Adam S and Sarfaty, Galit A}, - journal={Stan. J. Int'l L.}, - volume={53}, - pages={1}, - year={2017}, - publisher={HeinOnline} - } - """, + bibtex_citation=r""" +@article{chilton2017limitations, + author = {Chilton, Adam S and Sarfaty, Galit A}, + journal = {Stan. J. Int'l L.}, + pages = {1}, + publisher = {HeinOnline}, + title = {The limitations of supply chain disclosure regimes}, + volume = {53}, + year = {2017}, +} + +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} +""", ) def dataset_transform(self): @@ -5881,16 +5986,16 @@ class TelemarketingSalesRuleLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. 
Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} +""", ) def dataset_transform(self): @@ -5926,16 +6031,16 @@ class TextualismToolDictionariesLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} +""", ) def dataset_transform(self): @@ -5971,16 +6076,16 @@ class TextualismToolPlainLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. 
Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} +""", ) def dataset_transform(self): @@ -6016,16 +6121,16 @@ class UCCVCommonLawLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. 
Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} +""", ) def dataset_transform(self): @@ -6063,25 +6168,26 @@ class UnfairTOSLegalBenchClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - } - @article{lippi2019claudette, - title={CLAUDETTE: an automated detector of potentially unfair clauses in online terms of service}, - author={Lippi, Marco and Pa{\l}ka, Przemys{\l}aw and Contissa, Giuseppe and Lagioia, Francesca and Micklitz, Hans-Wolfgang and Sartor, Giovanni and Torroni, Paolo}, - journal={Artificial Intelligence and Law}, - volume={27}, - pages={117--139}, - year={2019}, - publisher={Springer} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{lippi2019claudette, + author = {Lippi, Marco and Pa{\l}ka, Przemys{\l}aw and Contissa, Giuseppe and Lagioia, Francesca and Micklitz, Hans-Wolfgang and Sartor, Giovanni and Torroni, Paolo}, + journal = {Artificial Intelligence and Law}, + pages = {117--139}, + publisher = {Springer}, + title = {CLAUDETTE: an automated detector of potentially unfair clauses in online terms of service}, + volume = {27}, + year = {2019}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Classification/eng/NewsClassification.py b/mteb/tasks/Classification/eng/NewsClassification.py index e09aa04255..aec198d5c1 100644 --- a/mteb/tasks/Classification/eng/NewsClassification.py +++ b/mteb/tasks/Classification/eng/NewsClassification.py @@ -29,16 +29,17 @@ class NewsClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=["eng-Latn-US", "en-Latn-GB", "en-Latn-AU"], sample_creation="found", - bibtex_citation=""" - @inproceedings{NIPS2015_250cf8b5, - author = {Zhang, Xiang and Zhao, Junbo and LeCun, Yann}, - booktitle = {Advances in Neural Information Processing Systems}, - editor = {C. Cortes and N. Lawrence and D. Lee and M. Sugiyama and R. 
Garnett}, - pages = {}, - publisher = {Curran Associates, Inc.}, - title = {Character-level Convolutional Networks for Text Classification}, - url = {https://proceedings.neurips.cc/paper_files/paper/2015/file/250cf8b51c773f3f8dc8b4be867a9a02-Paper.pdf}, - volume = {28}, - year = {2015} - }""", + bibtex_citation=r""" +@inproceedings{NIPS2015_250cf8b5, + author = {Zhang, Xiang and Zhao, Junbo and LeCun, Yann}, + booktitle = {Advances in Neural Information Processing Systems}, + editor = {C. Cortes and N. Lawrence and D. Lee and M. Sugiyama and R. Garnett}, + pages = {}, + publisher = {Curran Associates, Inc.}, + title = {Character-level Convolutional Networks for Text Classification}, + url = {https://proceedings.neurips.cc/paper_files/paper/2015/file/250cf8b51c773f3f8dc8b4be867a9a02-Paper.pdf}, + volume = {28}, + year = {2015}, +} +""", ) diff --git a/mteb/tasks/Classification/eng/PatentClassification.py b/mteb/tasks/Classification/eng/PatentClassification.py index 9f10a8a794..3876e64ddf 100644 --- a/mteb/tasks/Classification/eng/PatentClassification.py +++ b/mteb/tasks/Classification/eng/PatentClassification.py @@ -27,24 +27,26 @@ class PatentClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{sharma-etal-2019-bigpatent, - title = "{BIGPATENT}: A Large-Scale Dataset for Abstractive and Coherent Summarization", - author = "Sharma, Eva and - Li, Chen and - Wang, Lu", - editor = "Korhonen, Anna and - Traum, David and - M{\`a}rquez, Llu{\'\i}s", - booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics", - month = jul, - year = "2019", - address = "Florence, Italy", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/P19-1212", - doi = "10.18653/v1/P19-1212", - pages = "2204--2213", - abstract = "Most existing text summarization datasets are compiled from the news domain, where summaries have a 
flattened discourse structure. In such datasets, summary-worthy content often appears in the beginning of input articles. Moreover, large segments from input articles are present verbatim in their respective summaries. These issues impede the learning and evaluation of systems that can understand an article{'}s global content structure as well as produce abstractive summaries with high compression ratio. In this work, we present a novel dataset, BIGPATENT, consisting of 1.3 million records of U.S. patent documents along with human written abstractive summaries. Compared to existing summarization datasets, BIGPATENT has the following properties: i) summaries contain a richer discourse structure with more recurring entities, ii) salient content is evenly distributed in the input, and iii) lesser and shorter extractive fragments are present in the summaries. Finally, we train and evaluate baselines and popular learning models on BIGPATENT to shed light on new challenges and motivate future directions for summarization research.", - }""", + bibtex_citation=r""" +@inproceedings{sharma-etal-2019-bigpatent, + abstract = {Most existing text summarization datasets are compiled from the news domain, where summaries have a flattened discourse structure. In such datasets, summary-worthy content often appears in the beginning of input articles. Moreover, large segments from input articles are present verbatim in their respective summaries. These issues impede the learning and evaluation of systems that can understand an article{'}s global content structure as well as produce abstractive summaries with high compression ratio. In this work, we present a novel dataset, BIGPATENT, consisting of 1.3 million records of U.S. patent documents along with human written abstractive summaries. 
Compared to existing summarization datasets, BIGPATENT has the following properties: i) summaries contain a richer discourse structure with more recurring entities, ii) salient content is evenly distributed in the input, and iii) lesser and shorter extractive fragments are present in the summaries. Finally, we train and evaluate baselines and popular learning models on BIGPATENT to shed light on new challenges and motivate future directions for summarization research.}, + address = {Florence, Italy}, + author = {Sharma, Eva and +Li, Chen and +Wang, Lu}, + booktitle = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics}, + doi = {10.18653/v1/P19-1212}, + editor = {Korhonen, Anna and +Traum, David and +M{\`a}rquez, Llu{\'\i}s}, + month = jul, + pages = {2204--2213}, + publisher = {Association for Computational Linguistics}, + title = {{BIGPATENT}: A Large-Scale Dataset for Abstractive and Coherent Summarization}, + url = {https://aclanthology.org/P19-1212}, + year = {2019}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Classification/eng/PoemSentimentClassification.py b/mteb/tasks/Classification/eng/PoemSentimentClassification.py index f0110308ee..8671929fea 100644 --- a/mteb/tasks/Classification/eng/PoemSentimentClassification.py +++ b/mteb/tasks/Classification/eng/PoemSentimentClassification.py @@ -27,14 +27,14 @@ class PoemSentimentClassification(AbsTaskClassification): annotations_creators="human-annotated", dialect=["eng-Latn-US", "en-Latn-GB"], sample_creation="found", - bibtex_citation=""" + bibtex_citation=r""" @misc{sheng2020investigating, - title={Investigating Societal Biases in a Poetry Composition System}, - author={Emily Sheng and David Uthus}, - year={2020}, - eprint={2011.02686}, - archivePrefix={arXiv}, - primaryClass={cs.CL} + archiveprefix = {arXiv}, + author = {Emily Sheng and David Uthus}, + eprint = {2011.02686}, + primaryclass = {cs.CL}, + title = {Investigating Societal Biases in a Poetry 
Composition System}, + year = {2020}, } """, ) diff --git a/mteb/tasks/Classification/eng/SDSEyeProtectionClassification.py b/mteb/tasks/Classification/eng/SDSEyeProtectionClassification.py index 197060ba0c..5ae0df8602 100644 --- a/mteb/tasks/Classification/eng/SDSEyeProtectionClassification.py +++ b/mteb/tasks/Classification/eng/SDSEyeProtectionClassification.py @@ -26,19 +26,20 @@ class SDSEyeProtectionClassification(AbsTaskClassification): annotations_creators="LM-generated and reviewed", dialect=[], sample_creation="created", - bibtex_citation=""" - @article{kasmaee2024chemteb, - title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, - author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, - journal={arXiv preprint arXiv:2412.00532}, - year={2024} - } - @inproceedings{pereira2020msds, - title={MSDS-OPP: Operator Procedures Prediction in Material Safety Data Sheets}, - author={Pereira, Eliseu}, - booktitle={15th Doctoral Symposium}, - pages={42}, - year={2020} - } - """, + bibtex_citation=r""" +@article{kasmaee2024chemteb, + author = {Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal = {arXiv preprint arXiv:2412.00532}, + title = {ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + year = {2024}, +} + +@inproceedings{pereira2020msds, + author = {Pereira, Eliseu}, + booktitle = {15th Doctoral Symposium}, + pages = {42}, + title = {MSDS-OPP: Operator Procedures Prediction in Material Safety Data Sheets}, + year = {2020}, +} +""", ) diff --git a/mteb/tasks/Classification/eng/SDSGlovesClassification.py b/mteb/tasks/Classification/eng/SDSGlovesClassification.py index ac471d58e9..41b68096db 100644 --- 
a/mteb/tasks/Classification/eng/SDSGlovesClassification.py +++ b/mteb/tasks/Classification/eng/SDSGlovesClassification.py @@ -26,19 +26,20 @@ class SDSGlovesClassification(AbsTaskClassification): annotations_creators="LM-generated and reviewed", dialect=[], sample_creation="created", - bibtex_citation=""" - @article{kasmaee2024chemteb, - title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, - author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, - journal={arXiv preprint arXiv:2412.00532}, - year={2024} - } - @inproceedings{pereira2020msds, - title={MSDS-OPP: Operator Procedures Prediction in Material Safety Data Sheets}, - author={Pereira, Eliseu}, - booktitle={15th Doctoral Symposium}, - pages={42}, - year={2020} - } - """, + bibtex_citation=r""" +@article{kasmaee2024chemteb, + author = {Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal = {arXiv preprint arXiv:2412.00532}, + title = {ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + year = {2024}, +} + +@inproceedings{pereira2020msds, + author = {Pereira, Eliseu}, + booktitle = {15th Doctoral Symposium}, + pages = {42}, + title = {MSDS-OPP: Operator Procedures Prediction in Material Safety Data Sheets}, + year = {2020}, +} +""", ) diff --git a/mteb/tasks/Classification/eng/ToxicChatClassification.py b/mteb/tasks/Classification/eng/ToxicChatClassification.py index 51dd5066d3..e189cd51a0 100644 --- a/mteb/tasks/Classification/eng/ToxicChatClassification.py +++ b/mteb/tasks/Classification/eng/ToxicChatClassification.py @@ -37,14 +37,16 @@ class ToxicChatClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], 
sample_creation="found", - bibtex_citation="""@misc{lin2023toxicchat, - title={ToxicChat: Unveiling Hidden Challenges of Toxicity Detection in Real-World User-AI Conversation}, - author={Zi Lin and Zihan Wang and Yongqi Tong and Yangkun Wang and Yuxin Guo and Yujia Wang and Jingbo Shang}, - year={2023}, - eprint={2310.17389}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }""", + bibtex_citation=r""" +@misc{lin2023toxicchat, + archiveprefix = {arXiv}, + author = {Zi Lin and Zihan Wang and Yongqi Tong and Yangkun Wang and Yuxin Guo and Yujia Wang and Jingbo Shang}, + eprint = {2310.17389}, + primaryclass = {cs.CL}, + title = {ToxicChat: Unveiling Hidden Challenges of Toxicity Detection in Real-World User-AI Conversation}, + year = {2023}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Classification/eng/ToxicConversationsClassification.py b/mteb/tasks/Classification/eng/ToxicConversationsClassification.py index f99d44534d..439b19ba7c 100644 --- a/mteb/tasks/Classification/eng/ToxicConversationsClassification.py +++ b/mteb/tasks/Classification/eng/ToxicConversationsClassification.py @@ -29,13 +29,15 @@ class ToxicConversationsClassification(AbsTaskClassification): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@misc{jigsaw-unintended-bias-in-toxicity-classification, - author = {cjadams and Daniel Borkan and inversion and Jeffrey Sorensen and Lucas Dixon and Lucy Vasserman and nithum}, - title = {Jigsaw Unintended Bias in Toxicity Classification}, - publisher = {Kaggle}, - year = {2019}, - url = {https://kaggle.com/competitions/jigsaw-unintended-bias-in-toxicity-classification} -}""", + bibtex_citation=r""" +@misc{jigsaw-unintended-bias-in-toxicity-classification, + author = {cjadams and Daniel Borkan and inversion and Jeffrey Sorensen and Lucas Dixon and Lucy Vasserman and nithum}, + publisher = {Kaggle}, + title = {Jigsaw Unintended Bias in Toxicity Classification}, + url = 
{https://kaggle.com/competitions/jigsaw-unintended-bias-in-toxicity-classification}, + year = {2019}, +} +""", prompt="Classify the given comments as either toxic or not toxic", ) diff --git a/mteb/tasks/Classification/eng/TweetSentimentExtractionClassification.py b/mteb/tasks/Classification/eng/TweetSentimentExtractionClassification.py index d77c44936e..a4ab4b5c70 100644 --- a/mteb/tasks/Classification/eng/TweetSentimentExtractionClassification.py +++ b/mteb/tasks/Classification/eng/TweetSentimentExtractionClassification.py @@ -29,13 +29,15 @@ class TweetSentimentExtractionClassification(AbsTaskClassification): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@misc{tweet-sentiment-extraction, - author = {Maggie, Phil Culliton, Wei Chen}, - title = {Tweet Sentiment Extraction}, - publisher = {Kaggle}, - year = {2020}, - url = {https://kaggle.com/competitions/tweet-sentiment-extraction} -}""", + bibtex_citation=r""" +@misc{tweet-sentiment-extraction, + author = {Maggie, Phil Culliton, Wei Chen}, + publisher = {Kaggle}, + title = {Tweet Sentiment Extraction}, + url = {https://kaggle.com/competitions/tweet-sentiment-extraction}, + year = {2020}, +} +""", prompt="Classify the sentiment of a given tweet as either positive, negative, or neutral", ) diff --git a/mteb/tasks/Classification/eng/TweetTopicSingleClassification.py b/mteb/tasks/Classification/eng/TweetTopicSingleClassification.py index 6c7d4e2bbb..cad250dac0 100644 --- a/mteb/tasks/Classification/eng/TweetTopicSingleClassification.py +++ b/mteb/tasks/Classification/eng/TweetTopicSingleClassification.py @@ -32,22 +32,22 @@ class TweetTopicSingleClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @inproceedings{dimosthenis-etal-2022-twitter, - title = "{T}witter {T}opic {C}lassification", - author = "Antypas, Dimosthenis and - Ushio, Asahi and - Camacho-Collados, Jose and - 
Neves, Leonardo and - Silva, Vitor and - Barbieri, Francesco", - booktitle = "Proceedings of the 29th International Conference on Computational Linguistics", - month = oct, - year = "2022", - address = "Gyeongju, Republic of Korea", - publisher = "International Committee on Computational Linguistics" - } - """, + bibtex_citation=r""" +@inproceedings{dimosthenis-etal-2022-twitter, + address = {Gyeongju, Republic of Korea}, + author = {Antypas, Dimosthenis and +Ushio, Asahi and +Camacho-Collados, Jose and +Neves, Leonardo and +Silva, Vitor and +Barbieri, Francesco}, + booktitle = {Proceedings of the 29th International Conference on Computational Linguistics}, + month = oct, + publisher = {International Committee on Computational Linguistics}, + title = {{T}witter {T}opic {C}lassification}, + year = {2022}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Classification/eng/WikipediaBioMetChemClassification.py b/mteb/tasks/Classification/eng/WikipediaBioMetChemClassification.py index 3b494f46f6..c6ade13b66 100644 --- a/mteb/tasks/Classification/eng/WikipediaBioMetChemClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaBioMetChemClassification.py @@ -26,12 +26,12 @@ class WikipediaBioMetChemClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="created", - bibtex_citation=""" - @article{kasmaee2024chemteb, - title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, - author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, - journal={arXiv preprint arXiv:2412.00532}, - year={2024} - } - """, + bibtex_citation=r""" +@article{kasmaee2024chemteb, + author = {Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal = {arXiv 
preprint arXiv:2412.00532}, + title = {ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + year = {2024}, +} +""", ) diff --git a/mteb/tasks/Classification/eng/WikipediaBiolumNeurochemClassification.py b/mteb/tasks/Classification/eng/WikipediaBiolumNeurochemClassification.py index 623ec8fc66..cd103aacf8 100644 --- a/mteb/tasks/Classification/eng/WikipediaBiolumNeurochemClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaBiolumNeurochemClassification.py @@ -26,12 +26,12 @@ class WikipediaBiolumNeurochemClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="created", - bibtex_citation=""" - @article{kasmaee2024chemteb, - title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, - author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, - journal={arXiv preprint arXiv:2412.00532}, - year={2024} - } - """, + bibtex_citation=r""" +@article{kasmaee2024chemteb, + author = {Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal = {arXiv preprint arXiv:2412.00532}, + title = {ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + year = {2024}, +} +""", ) diff --git a/mteb/tasks/Classification/eng/WikipediaChemEngSpecialtiesClassification.py b/mteb/tasks/Classification/eng/WikipediaChemEngSpecialtiesClassification.py index c95abcd4f2..4a61cb4665 100644 --- a/mteb/tasks/Classification/eng/WikipediaChemEngSpecialtiesClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaChemEngSpecialtiesClassification.py @@ -26,12 +26,12 @@ class 
WikipediaChemEngSpecialtiesClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="created", - bibtex_citation=""" - @article{kasmaee2024chemteb, - title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, - author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, - journal={arXiv preprint arXiv:2412.00532}, - year={2024} - } - """, + bibtex_citation=r""" +@article{kasmaee2024chemteb, + author = {Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal = {arXiv preprint arXiv:2412.00532}, + title = {ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + year = {2024}, +} +""", ) diff --git a/mteb/tasks/Classification/eng/WikipediaChemFieldsClassification.py b/mteb/tasks/Classification/eng/WikipediaChemFieldsClassification.py index 7c0179fb1e..940bcdc44e 100644 --- a/mteb/tasks/Classification/eng/WikipediaChemFieldsClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaChemFieldsClassification.py @@ -26,12 +26,12 @@ class WikipediaChemFieldsClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="created", - bibtex_citation=""" - @article{kasmaee2024chemteb, - title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, - author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, - journal={arXiv preprint arXiv:2412.00532}, - year={2024} - } - """, + bibtex_citation=r""" +@article{kasmaee2024chemteb, + author = {Kasmaee, Ali Shiraee and Khodadad, Mohammad and 
Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal = {arXiv preprint arXiv:2412.00532}, + title = {ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + year = {2024}, +} +""", ) diff --git a/mteb/tasks/Classification/eng/WikipediaChemistryTopicsClassification.py b/mteb/tasks/Classification/eng/WikipediaChemistryTopicsClassification.py index 02751b1a32..b6e0d362bb 100644 --- a/mteb/tasks/Classification/eng/WikipediaChemistryTopicsClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaChemistryTopicsClassification.py @@ -26,12 +26,12 @@ class WikipediaChemistryTopicsClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="created", - bibtex_citation=""" - @article{kasmaee2024chemteb, - title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, - author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, - journal={arXiv preprint arXiv:2412.00532}, - year={2024} - } - """, + bibtex_citation=r""" +@article{kasmaee2024chemteb, + author = {Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal = {arXiv preprint arXiv:2412.00532}, + title = {ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + year = {2024}, +} +""", ) diff --git a/mteb/tasks/Classification/eng/WikipediaCompChemSpectroscopyClassification.py b/mteb/tasks/Classification/eng/WikipediaCompChemSpectroscopyClassification.py index 28a42ac044..8ee7c5b145 100644 --- a/mteb/tasks/Classification/eng/WikipediaCompChemSpectroscopyClassification.py +++ 
b/mteb/tasks/Classification/eng/WikipediaCompChemSpectroscopyClassification.py @@ -26,12 +26,12 @@ class WikipediaCompChemSpectroscopyClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="created", - bibtex_citation=""" - @article{kasmaee2024chemteb, - title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, - author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, - journal={arXiv preprint arXiv:2412.00532}, - year={2024} - } - """, + bibtex_citation=r""" +@article{kasmaee2024chemteb, + author = {Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal = {arXiv preprint arXiv:2412.00532}, + title = {ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + year = {2024}, +} +""", ) diff --git a/mteb/tasks/Classification/eng/WikipediaCryobiologySeparationClassification.py b/mteb/tasks/Classification/eng/WikipediaCryobiologySeparationClassification.py index 0e01454298..4f64eb2ff8 100644 --- a/mteb/tasks/Classification/eng/WikipediaCryobiologySeparationClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaCryobiologySeparationClassification.py @@ -26,12 +26,12 @@ class WikipediaCryobiologySeparationClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="created", - bibtex_citation=""" - @article{kasmaee2024chemteb, - title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, - author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, - journal={arXiv 
preprint arXiv:2412.00532}, - year={2024} - } - """, + bibtex_citation=r""" +@article{kasmaee2024chemteb, + author = {Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal = {arXiv preprint arXiv:2412.00532}, + title = {ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + year = {2024}, +} +""", ) diff --git a/mteb/tasks/Classification/eng/WikipediaCrystallographyAnalyticalClassification.py b/mteb/tasks/Classification/eng/WikipediaCrystallographyAnalyticalClassification.py index 724ffc4249..9bc991261a 100644 --- a/mteb/tasks/Classification/eng/WikipediaCrystallographyAnalyticalClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaCrystallographyAnalyticalClassification.py @@ -26,12 +26,12 @@ class WikipediaCrystallographyAnalyticalClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="created", - bibtex_citation=""" - @article{kasmaee2024chemteb, - title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, - author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, - journal={arXiv preprint arXiv:2412.00532}, - year={2024} - } - """, + bibtex_citation=r""" +@article{kasmaee2024chemteb, + author = {Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal = {arXiv preprint arXiv:2412.00532}, + title = {ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + year = {2024}, +} +""", ) diff --git a/mteb/tasks/Classification/eng/WikipediaGreenhouseEnantiopureClassification.py 
b/mteb/tasks/Classification/eng/WikipediaGreenhouseEnantiopureClassification.py index b701584a70..32f8bc949b 100644 --- a/mteb/tasks/Classification/eng/WikipediaGreenhouseEnantiopureClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaGreenhouseEnantiopureClassification.py @@ -26,12 +26,12 @@ class WikipediaGreenhouseEnantiopureClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="created", - bibtex_citation=""" - @article{kasmaee2024chemteb, - title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, - author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, - journal={arXiv preprint arXiv:2412.00532}, - year={2024} - } - """, + bibtex_citation=r""" +@article{kasmaee2024chemteb, + author = {Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal = {arXiv preprint arXiv:2412.00532}, + title = {ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + year = {2024}, +} +""", ) diff --git a/mteb/tasks/Classification/eng/WikipediaIsotopesFissionClassification.py b/mteb/tasks/Classification/eng/WikipediaIsotopesFissionClassification.py index 252ad85ed9..f3cb8f52c3 100644 --- a/mteb/tasks/Classification/eng/WikipediaIsotopesFissionClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaIsotopesFissionClassification.py @@ -26,12 +26,12 @@ class WikipediaIsotopesFissionClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="created", - bibtex_citation=""" - @article{kasmaee2024chemteb, - title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, - 
author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, - journal={arXiv preprint arXiv:2412.00532}, - year={2024} - } - """, + bibtex_citation=r""" +@article{kasmaee2024chemteb, + author = {Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal = {arXiv preprint arXiv:2412.00532}, + title = {ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + year = {2024}, +} +""", ) diff --git a/mteb/tasks/Classification/eng/WikipediaLuminescenceClassification.py b/mteb/tasks/Classification/eng/WikipediaLuminescenceClassification.py index 8e115b59d4..e414138cd6 100644 --- a/mteb/tasks/Classification/eng/WikipediaLuminescenceClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaLuminescenceClassification.py @@ -26,12 +26,12 @@ class WikipediaLuminescenceClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="created", - bibtex_citation=""" - @article{kasmaee2024chemteb, - title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, - author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, - journal={arXiv preprint arXiv:2412.00532}, - year={2024} - } - """, + bibtex_citation=r""" +@article{kasmaee2024chemteb, + author = {Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal = {arXiv preprint arXiv:2412.00532}, + title = {ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + year = {2024}, +} +""", ) 
diff --git a/mteb/tasks/Classification/eng/WikipediaOrganicInorganicClassification.py b/mteb/tasks/Classification/eng/WikipediaOrganicInorganicClassification.py index 0ad784b69b..bde49beefa 100644 --- a/mteb/tasks/Classification/eng/WikipediaOrganicInorganicClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaOrganicInorganicClassification.py @@ -26,12 +26,12 @@ class WikipediaOrganicInorganicClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="created", - bibtex_citation=""" - @article{kasmaee2024chemteb, - title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, - author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, - journal={arXiv preprint arXiv:2412.00532}, - year={2024} - } - """, + bibtex_citation=r""" +@article{kasmaee2024chemteb, + author = {Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal = {arXiv preprint arXiv:2412.00532}, + title = {ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + year = {2024}, +} +""", ) diff --git a/mteb/tasks/Classification/eng/WikipediaSaltsSemiconductorsClassification.py b/mteb/tasks/Classification/eng/WikipediaSaltsSemiconductorsClassification.py index a409f87c8d..55ef384d0e 100644 --- a/mteb/tasks/Classification/eng/WikipediaSaltsSemiconductorsClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaSaltsSemiconductorsClassification.py @@ -26,12 +26,12 @@ class WikipediaSaltsSemiconductorsClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="created", - bibtex_citation=""" - @article{kasmaee2024chemteb, - title={ChemTEB: Chemical Text Embedding Benchmark, 
an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, - author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, - journal={arXiv preprint arXiv:2412.00532}, - year={2024} - } - """, + bibtex_citation=r""" +@article{kasmaee2024chemteb, + author = {Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal = {arXiv preprint arXiv:2412.00532}, + title = {ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + year = {2024}, +} +""", ) diff --git a/mteb/tasks/Classification/eng/WikipediaSolidStateColloidalClassification.py b/mteb/tasks/Classification/eng/WikipediaSolidStateColloidalClassification.py index 43f95c50f3..eab8e2cdcc 100644 --- a/mteb/tasks/Classification/eng/WikipediaSolidStateColloidalClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaSolidStateColloidalClassification.py @@ -26,12 +26,12 @@ class WikipediaSolidStateColloidalClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="created", - bibtex_citation=""" - @article{kasmaee2024chemteb, - title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, - author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, - journal={arXiv preprint arXiv:2412.00532}, - year={2024} - } - """, + bibtex_citation=r""" +@article{kasmaee2024chemteb, + author = {Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal = {arXiv preprint arXiv:2412.00532}, + title = {ChemTEB: Chemical Text Embedding 
Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + year = {2024}, +} +""", ) diff --git a/mteb/tasks/Classification/eng/WikipediaTheoreticalAppliedClassification.py b/mteb/tasks/Classification/eng/WikipediaTheoreticalAppliedClassification.py index f33b02f4bb..f1d97d3a70 100644 --- a/mteb/tasks/Classification/eng/WikipediaTheoreticalAppliedClassification.py +++ b/mteb/tasks/Classification/eng/WikipediaTheoreticalAppliedClassification.py @@ -26,12 +26,12 @@ class WikipediaTheoreticalAppliedClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="created", - bibtex_citation=""" - @article{kasmaee2024chemteb, - title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, - author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, - journal={arXiv preprint arXiv:2412.00532}, - year={2024} - } - """, + bibtex_citation=r""" +@article{kasmaee2024chemteb, + author = {Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal = {arXiv preprint arXiv:2412.00532}, + title = {ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + year = {2024}, +} +""", ) diff --git a/mteb/tasks/Classification/eng/YahooAnswersTopicsClassification.py b/mteb/tasks/Classification/eng/YahooAnswersTopicsClassification.py index 9369b0f6b1..7a699bfaaf 100644 --- a/mteb/tasks/Classification/eng/YahooAnswersTopicsClassification.py +++ b/mteb/tasks/Classification/eng/YahooAnswersTopicsClassification.py @@ -27,18 +27,19 @@ class YahooAnswersTopicsClassification(AbsTaskClassification): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" 
- @inproceedings{NIPS2015_250cf8b5, - author = {Zhang, Xiang and Zhao, Junbo and LeCun, Yann}, - booktitle = {Advances in Neural Information Processing Systems}, - editor = {C. Cortes and N. Lawrence and D. Lee and M. Sugiyama and R. Garnett}, - pages = {}, - publisher = {Curran Associates, Inc.}, - title = {Character-level Convolutional Networks for Text Classification}, - url = {https://proceedings.neurips.cc/paper_files/paper/2015/file/250cf8b51c773f3f8dc8b4be867a9a02-Paper.pdf}, - volume = {28}, - year = {2015} - }""", + bibtex_citation=r""" +@inproceedings{NIPS2015_250cf8b5, + author = {Zhang, Xiang and Zhao, Junbo and LeCun, Yann}, + booktitle = {Advances in Neural Information Processing Systems}, + editor = {C. Cortes and N. Lawrence and D. Lee and M. Sugiyama and R. Garnett}, + pages = {}, + publisher = {Curran Associates, Inc.}, + title = {Character-level Convolutional Networks for Text Classification}, + url = {https://proceedings.neurips.cc/paper_files/paper/2015/file/250cf8b51c773f3f8dc8b4be867a9a02-Paper.pdf}, + volume = {28}, + year = {2015}, +} +""", ) samples_per_label = 32 diff --git a/mteb/tasks/Classification/eng/YelpReviewFullClassification.py b/mteb/tasks/Classification/eng/YelpReviewFullClassification.py index 584d5b5266..2c088af31a 100644 --- a/mteb/tasks/Classification/eng/YelpReviewFullClassification.py +++ b/mteb/tasks/Classification/eng/YelpReviewFullClassification.py @@ -26,19 +26,19 @@ class YelpReviewFullClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation=""" - @inproceedings{NIPS2015_250cf8b5, - author = {Zhang, Xiang and Zhao, Junbo and LeCun, Yann}, - booktitle = {Advances in Neural Information Processing Systems}, - editor = {C. Cortes and N. Lawrence and D. Lee and M. Sugiyama and R. 
Garnett}, - pages = {}, - publisher = {Curran Associates, Inc.}, - title = {Character-level Convolutional Networks for Text Classification}, - url = {https://proceedings.neurips.cc/paper_files/paper/2015/file/250cf8b51c773f3f8dc8b4be867a9a02-Paper.pdf}, - volume = {28}, - year = {2015} - } - """, + bibtex_citation=r""" +@inproceedings{NIPS2015_250cf8b5, + author = {Zhang, Xiang and Zhao, Junbo and LeCun, Yann}, + booktitle = {Advances in Neural Information Processing Systems}, + editor = {C. Cortes and N. Lawrence and D. Lee and M. Sugiyama and R. Garnett}, + pages = {}, + publisher = {Curran Associates, Inc.}, + title = {Character-level Convolutional Networks for Text Classification}, + url = {https://proceedings.neurips.cc/paper_files/paper/2015/file/250cf8b51c773f3f8dc8b4be867a9a02-Paper.pdf}, + volume = {28}, + year = {2015}, +} +""", ) samples_per_label = 128 diff --git a/mteb/tasks/Classification/est/estonian_valence.py b/mteb/tasks/Classification/est/estonian_valence.py index 3f15ee7925..11561aa385 100644 --- a/mteb/tasks/Classification/est/estonian_valence.py +++ b/mteb/tasks/Classification/est/estonian_valence.py @@ -29,15 +29,16 @@ class EstonianValenceClassification(AbsTaskClassification): license="cc-by-4.0", annotations_creators="human-annotated", sample_creation="found", - bibtex_citation=""" + bibtex_citation=r""" @article{Pajupuu2023, - author = "Hille Pajupuu and Jaan Pajupuu and Rene Altrov and Kairi Tamuri", - title = "{Estonian Valence Corpus / Eesti valentsikorpus}", - year = "2023", - month = "11", - url = "https://figshare.com/articles/dataset/Estonian_Valence_Corpus_Eesti_valentsikorpus/24517054", - doi = "10.6084/m9.figshare.24517054.v1" -}""", + author = {Hille Pajupuu and Jaan Pajupuu and Rene Altrov and Kairi Tamuri}, + doi = {10.6084/m9.figshare.24517054.v1}, + month = {11}, + title = {{Estonian Valence Corpus / Eesti valentsikorpus}}, + url = {https://figshare.com/articles/dataset/Estonian_Valence_Corpus_Eesti_valentsikorpus/24517054}, 
+ year = {2023}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Classification/fas/PersianFoodSentimentClassification.py b/mteb/tasks/Classification/fas/PersianFoodSentimentClassification.py index f7389e57bc..5b66d39efc 100644 --- a/mteb/tasks/Classification/fas/PersianFoodSentimentClassification.py +++ b/mteb/tasks/Classification/fas/PersianFoodSentimentClassification.py @@ -28,15 +28,15 @@ class PersianFoodSentimentClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation=""" - @article{ParsBERT, - title={ParsBERT: Transformer-based Model for Persian Language Understanding}, - author={Mehrdad Farahani, Mohammad Gharachorloo, Marzieh Farahani, Mohammad Manthouri}, - journal={ArXiv}, - year={2020}, - volume={abs/2005.12515} - } - """, + bibtex_citation=r""" +@article{ParsBERT, + author = {Mehrdad Farahani, Mohammad Gharachorloo, Marzieh Farahani, Mohammad Manthouri}, + journal = {ArXiv}, + title = {ParsBERT: Transformer-based Model for Persian Language Understanding}, + volume = {abs/2005.12515}, + year = {2020}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Classification/fil/FilipinoHateSpeechClassification.py b/mteb/tasks/Classification/fil/FilipinoHateSpeechClassification.py index 3715103ca2..047e91df8e 100644 --- a/mteb/tasks/Classification/fil/FilipinoHateSpeechClassification.py +++ b/mteb/tasks/Classification/fil/FilipinoHateSpeechClassification.py @@ -29,17 +29,17 @@ class FilipinoHateSpeechClassification(AbsTaskClassification): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @article{Cabasag-2019-hate-speech, - title={Hate speech in Philippine election-related tweets: Automatic detection and classification using natural language processing.}, - author={Neil Vicente Cabasag, Vicente Raphael Chan, Sean Christian Lim, Mark Edward Gonzales, and Charibeth Cheng}, - journal={Philippine Computing Journal}, - 
volume={XIV}, - number={1}, - month={August}, - year={2019} - } - """, + bibtex_citation=r""" +@article{Cabasag-2019-hate-speech, + author = {Neil Vicente Cabasag, Vicente Raphael Chan, Sean Christian Lim, Mark Edward Gonzales, and Charibeth Cheng}, + journal = {Philippine Computing Journal}, + month = {August}, + number = {1}, + title = {Hate speech in Philippine election-related tweets: Automatic detection and classification using natural language processing.}, + volume = {XIV}, + year = {2019}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Classification/fil/FilipinoShopeeReviewsClassification.py b/mteb/tasks/Classification/fil/FilipinoShopeeReviewsClassification.py index d91af36567..8c35e86cea 100644 --- a/mteb/tasks/Classification/fil/FilipinoShopeeReviewsClassification.py +++ b/mteb/tasks/Classification/fil/FilipinoShopeeReviewsClassification.py @@ -26,15 +26,16 @@ class FilipinoShopeeReviewsClassification(AbsTaskClassification): sample_creation="found", date=("2022-05-13", "2023-05-13"), main_score="accuracy", - bibtex_citation=""" - @article{riegoenhancement, - title={Enhancement to Low-Resource Text Classification via Sequential Transfer Learning}, - author={Riego, Neil Christian R. and Villarba, Danny Bell and Sison, Ariel Antwaun Rolando C. and Pineda, Fernandez C. and Lagunzad, Herminiño C.} - journal={United International Journal for Research & Technology}, - volume={04}, - issue={08}, - pages={72--82} - }""", + bibtex_citation=r""" +@article{riegoenhancement, + author = {Riego, Neil Christian R. and Villarba, Danny Bell and Sison, Ariel Antwaun Rolando C. and Pineda, Fernandez C. 
and Lagunzad, Herminiño C.}, + issue = {08}, + journal = {United International Journal for Research & Technology}, + pages = {72--82}, + title = {Enhancement to Low-Resource Text Classification via Sequential Transfer Learning}, + volume = {04}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Classification/fin/FinToxicityClassification.py b/mteb/tasks/Classification/fin/FinToxicityClassification.py index 2b582c0143..d847dac5a8 100644 --- a/mteb/tasks/Classification/fin/FinToxicityClassification.py +++ b/mteb/tasks/Classification/fin/FinToxicityClassification.py @@ -30,18 +30,19 @@ class FinToxicityClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="machine-translated", - bibtex_citation=""" - @inproceedings{eskelinen-etal-2023-toxicity, - title = "Toxicity Detection in {F}innish Using Machine Translation", - author = "Eskelinen, Anni and - Silvala, Laura and - Ginter, Filip and - Pyysalo, Sampo and - Laippala, Veronika", - booktitle = "Proceedings of the 24th Nordic Conference on Computational Linguistics (NoDaLiDa)", - month = may, - year = "2023", - }""", + bibtex_citation=r""" +@inproceedings{eskelinen-etal-2023-toxicity, + author = {Eskelinen, Anni and +Silvala, Laura and +Ginter, Filip and +Pyysalo, Sampo and +Laippala, Veronika}, + booktitle = {Proceedings of the 24th Nordic Conference on Computational Linguistics (NoDaLiDa)}, + month = may, + title = {Toxicity Detection in {F}innish Using Machine Translation}, + year = {2023}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Classification/fra/MovieReviewSentimentClassification.py b/mteb/tasks/Classification/fra/MovieReviewSentimentClassification.py index ea1971a715..b488661093 100644 --- a/mteb/tasks/Classification/fra/MovieReviewSentimentClassification.py +++ b/mteb/tasks/Classification/fra/MovieReviewSentimentClassification.py @@ -26,10 +26,10 @@ class MovieReviewSentimentClassification(AbsTaskClassification): 
annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation=""" + bibtex_citation=r""" @software{blard2020, - title = {French sentiment analysis with BERT}, author = {Théophile Blard}, + title = {French sentiment analysis with BERT}, url = {https://github.com/TheophileBlard/french-sentiment-analysis-with-bert}, year = {2020}, } diff --git a/mteb/tasks/Classification/heb/HebrewSentimentAnalysis.py b/mteb/tasks/Classification/heb/HebrewSentimentAnalysis.py index a4162801b3..5f70955710 100644 --- a/mteb/tasks/Classification/heb/HebrewSentimentAnalysis.py +++ b/mteb/tasks/Classification/heb/HebrewSentimentAnalysis.py @@ -30,19 +30,19 @@ class HebrewSentimentAnalysis(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation="""" - @inproceedings{amram-etal-2018-representations, - title = "Representations and Architectures in Neural Sentiment Analysis for Morphologically Rich Languages: A Case Study from {M}odern {H}ebrew", - author = "Amram, Adam and Ben David, Anat and Tsarfaty, Reut", - booktitle = "Proceedings of the 27th International Conference on Computational Linguistics", - month = aug, - year = "2018", - address = "Santa Fe, New Mexico, USA", - publisher = "Association for Computational Linguistics", - url = "https://www.aclweb.org/anthology/C18-1190", - pages = "2242--2252" - } - """, + bibtex_citation=r""" +@inproceedings{amram-etal-2018-representations, + address = {Santa Fe, New Mexico, USA}, + author = {Amram, Adam and Ben David, Anat and Tsarfaty, Reut}, + booktitle = {Proceedings of the 27th International Conference on Computational Linguistics}, + month = aug, + pages = {2242--2252}, + publisher = {Association for Computational Linguistics}, + title = {Representations and Architectures in Neural Sentiment Analysis for Morphologically Rich Languages: A Case Study from {M}odern {H}ebrew}, + url = {https://www.aclweb.org/anthology/C18-1190}, + year = {2018}, +} +""", ) 
def dataset_transform(self): diff --git a/mteb/tasks/Classification/hin/HindiDiscourseClassification.py b/mteb/tasks/Classification/hin/HindiDiscourseClassification.py index 936fabe2cd..52fc83a720 100644 --- a/mteb/tasks/Classification/hin/HindiDiscourseClassification.py +++ b/mteb/tasks/Classification/hin/HindiDiscourseClassification.py @@ -27,29 +27,30 @@ class HindiDiscourseClassification(AbsTaskClassification): license="mit", annotations_creators="expert-annotated", sample_creation="found", - bibtex_citation=""" - @inproceedings{dhanwal-etal-2020-annotated, - title = "An Annotated Dataset of Discourse Modes in {H}indi Stories", - author = "Dhanwal, Swapnil and - Dutta, Hritwik and - Nankani, Hitesh and - Shrivastava, Nilay and - Kumar, Yaman and - Li, Junyi Jessy and - Mahata, Debanjan and - Gosangi, Rakesh and - Zhang, Haimin and - Shah, Rajiv Ratn and - Stent, Amanda", - booktitle = "Proceedings of the 12th Language Resources and Evaluation Conference", - month = may, - year = "2020", - address = "Marseille, France", - publisher = "European Language Resources Association", - url = "https://www.aclweb.org/anthology/2020.lrec-1.149", - language = "English", - ISBN = "979-10-95546-34-4", -}""", + bibtex_citation=r""" +@inproceedings{dhanwal-etal-2020-annotated, + address = {Marseille, France}, + author = {Dhanwal, Swapnil and +Dutta, Hritwik and +Nankani, Hitesh and +Shrivastava, Nilay and +Kumar, Yaman and +Li, Junyi Jessy and +Mahata, Debanjan and +Gosangi, Rakesh and +Zhang, Haimin and +Shah, Rajiv Ratn and +Stent, Amanda}, + booktitle = {Proceedings of the 12th Language Resources and Evaluation Conference}, + isbn = {979-10-95546-34-4}, + language = {English}, + month = may, + publisher = {European Language Resources Association}, + title = {An Annotated Dataset of Discourse Modes in {H}indi Stories}, + url = {https://www.aclweb.org/anthology/2020.lrec-1.149}, + year = {2020}, +} +""", ) def dataset_transform(self): diff --git 
a/mteb/tasks/Classification/hin/SentimentAnalysisHindi.py b/mteb/tasks/Classification/hin/SentimentAnalysisHindi.py index 39daf9b14a..c922567b8f 100644 --- a/mteb/tasks/Classification/hin/SentimentAnalysisHindi.py +++ b/mteb/tasks/Classification/hin/SentimentAnalysisHindi.py @@ -26,13 +26,16 @@ class SentimentAnalysisHindi(AbsTaskClassification): license="cc-by-nc-sa-4.0", annotations_creators="derived", sample_creation="found", - bibtex_citation="""@misc{OdiaGenAI, - author = {Shantipriya Parida and Sambit Sekhar and Soumendra Kumar Sahoo and Swateek Jena and Abhijeet Parida and Satya Ranjan Dash and Guneet Singh Kohli}, - title = {OdiaGenAI: Generative AI and LLM Initiative for the Odia Language}, - year = {2023}, - publisher = {Hugging Face}, - journal = {Hugging Face repository}, - howpublished = {{https://huggingface.co/OdiaGenAI}}, } """, + bibtex_citation=r""" +@misc{OdiaGenAI, + author = {Shantipriya Parida and Sambit Sekhar and Soumendra Kumar Sahoo and Swateek Jena and Abhijeet Parida and Satya Ranjan Dash and Guneet Singh Kohli}, + howpublished = {{https://huggingface.co/OdiaGenAI}}, + journal = {Hugging Face repository}, + publisher = {Hugging Face}, + title = {OdiaGenAI: Generative AI and LLM Initiative for the Odia Language}, + year = {2023}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Classification/hrv/FrenkHrClassification.py b/mteb/tasks/Classification/hrv/FrenkHrClassification.py index e89ad42eb3..440e0c90ad 100644 --- a/mteb/tasks/Classification/hrv/FrenkHrClassification.py +++ b/mteb/tasks/Classification/hrv/FrenkHrClassification.py @@ -27,13 +27,15 @@ class FrenkHrClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@misc{ljubešić2019frenk, - title={The FRENK Datasets of Socially Unacceptable Discourse in Slovene and English}, - author={Nikola Ljubešić and Darja Fišer and Tomaž Erjavec}, - year={2019}, - eprint={1906.02045}, - archivePrefix={arXiv}, 
- primaryClass={cs.CL}, - url={https://arxiv.org/abs/1906.02045} - }""", + bibtex_citation=r""" +@misc{ljubešić2019frenk, + archiveprefix = {arXiv}, + author = {Nikola Ljubešić and Darja Fišer and Tomaž Erjavec}, + eprint = {1906.02045}, + primaryclass = {cs.CL}, + title = {The FRENK Datasets of Socially Unacceptable Discourse in Slovene and English}, + url = {https://arxiv.org/abs/1906.02045}, + year = {2019}, +} +""", ) diff --git a/mteb/tasks/Classification/ind/IndonesianIdClickbaitClassification.py b/mteb/tasks/Classification/ind/IndonesianIdClickbaitClassification.py index 9fece9e214..adcfbd57df 100644 --- a/mteb/tasks/Classification/ind/IndonesianIdClickbaitClassification.py +++ b/mteb/tasks/Classification/ind/IndonesianIdClickbaitClassification.py @@ -26,19 +26,19 @@ class IndonesianIdClickbaitClassification(AbsTaskClassification): license="cc-by-4.0", annotations_creators="expert-annotated", sample_creation="found", - bibtex_citation=""" - @article{WILLIAM2020106231, -title = "CLICK-ID: A novel dataset for Indonesian clickbait headlines", -journal = "Data in Brief", -volume = "32", -pages = "106231", -year = "2020", -issn = "2352-3409", -doi = "https://doi.org/10.1016/j.dib.2020.106231", -url = "http://www.sciencedirect.com/science/article/pii/S2352340920311252", -author = "Andika William and Yunita Sari", -keywords = "Indonesian, Natural Language Processing, News articles, Clickbait, Text-classification", -abstract = "News analysis is a popular task in Natural Language Processing (NLP). In particular, the problem of clickbait in news analysis has gained attention in recent years [1, 2]. However, the majority of the tasks has been focused on English news, in which there is already a rich representative resource. For other languages, such as Indonesian, there is still a lack of resource for clickbait tasks. Therefore, we introduce the CLICK-ID dataset of Indonesian news headlines extracted from 12 Indonesian online news publishers. 
It is comprised of 15,000 annotated headlines with clickbait and non-clickbait labels. Using the CLICK-ID dataset, we then developed an Indonesian clickbait classification model achieving favourable performance. We believe that this corpus will be useful for replicable experiments in clickbait detection or other experiments in NLP areas." + bibtex_citation=r""" +@article{WILLIAM2020106231, + abstract = {News analysis is a popular task in Natural Language Processing (NLP). In particular, the problem of clickbait in news analysis has gained attention in recent years [1, 2]. However, the majority of the tasks has been focused on English news, in which there is already a rich representative resource. For other languages, such as Indonesian, there is still a lack of resource for clickbait tasks. Therefore, we introduce the CLICK-ID dataset of Indonesian news headlines extracted from 12 Indonesian online news publishers. It is comprised of 15,000 annotated headlines with clickbait and non-clickbait labels. Using the CLICK-ID dataset, we then developed an Indonesian clickbait classification model achieving favourable performance. 
We believe that this corpus will be useful for replicable experiments in clickbait detection or other experiments in NLP areas.}, + author = {Andika William and Yunita Sari}, + doi = {https://doi.org/10.1016/j.dib.2020.106231}, + issn = {2352-3409}, + journal = {Data in Brief}, + keywords = {Indonesian, Natural Language Processing, News articles, Clickbait, Text-classification}, + pages = {106231}, + title = {CLICK-ID: A novel dataset for Indonesian clickbait headlines}, + url = {http://www.sciencedirect.com/science/article/pii/S2352340920311252}, + volume = {32}, + year = {2020}, } """, ) diff --git a/mteb/tasks/Classification/ind/IndonesianMongabayConservationClassification.py b/mteb/tasks/Classification/ind/IndonesianMongabayConservationClassification.py index 91e54bc137..cef0f33fac 100644 --- a/mteb/tasks/Classification/ind/IndonesianMongabayConservationClassification.py +++ b/mteb/tasks/Classification/ind/IndonesianMongabayConservationClassification.py @@ -31,29 +31,29 @@ class IndonesianMongabayConservationClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation=""" - @inproceedings{fransiska-etal-2023-utilizing, - title = "Utilizing Weak Supervision to Generate {I}ndonesian Conservation Datasets", - author = "Fransiska, Mega and - Pitaloka, Diah and - Saripudin, Saripudin and - Putra, Satrio and - Sutawika*, Lintang", - editor = "Wijaya, Derry and - Aji, Alham Fikri and - Vania, Clara and - Winata, Genta Indra and - Purwarianti, Ayu", - booktitle = "Proceedings of the First Workshop in South East Asian Language Processing", - month = nov, - year = "2023", - address = "Nusa Dua, Bali, Indonesia", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/2023.sealp-1.4", - doi = "10.18653/v1/2023.sealp-1.4", - pages = "30--54", - } - """, + bibtex_citation=r""" +@inproceedings{fransiska-etal-2023-utilizing, + address = {Nusa Dua, Bali, Indonesia}, + author = 
{Fransiska, Mega and +Pitaloka, Diah and +Saripudin, Saripudin and +Putra, Satrio and +Sutawika*, Lintang}, + booktitle = {Proceedings of the First Workshop in South East Asian Language Processing}, + doi = {10.18653/v1/2023.sealp-1.4}, + editor = {Wijaya, Derry and +Aji, Alham Fikri and +Vania, Clara and +Winata, Genta Indra and +Purwarianti, Ayu}, + month = nov, + pages = {30--54}, + publisher = {Association for Computational Linguistics}, + title = {Utilizing Weak Supervision to Generate {I}ndonesian Conservation Datasets}, + url = {https://aclanthology.org/2023.sealp-1.4}, + year = {2023}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Classification/ita/ItaCaseholdClassification.py b/mteb/tasks/Classification/ita/ItaCaseholdClassification.py index 837383ff69..b2024d6e79 100644 --- a/mteb/tasks/Classification/ita/ItaCaseholdClassification.py +++ b/mteb/tasks/Classification/ita/ItaCaseholdClassification.py @@ -26,25 +26,25 @@ class ItaCaseholdClassification(AbsTaskClassification): license="apache-2.0", annotations_creators="expert-annotated", sample_creation="found", - bibtex_citation=""" - @inproceedings{10.1145/3594536.3595177, - author = {Licari, Daniele and Bushipaka, Praveen and Marino, Gabriele and Comand\'{e}, Giovanni and Cucinotta, Tommaso}, - title = {Legal Holding Extraction from Italian Case Documents using Italian-LEGAL-BERT Text Summarization}, - year = {2023}, - isbn = {9798400701979}, - publisher = {Association for Computing Machinery}, - address = {New York, NY, USA}, - url = {https://doi.org/10.1145/3594536.3595177}, - doi = {10.1145/3594536.3595177}, - abstract = {Legal holdings are used in Italy as a critical component of the legal system, serving to establish legal precedents, provide guidance for future legal decisions, and ensure consistency and predictability in the interpretation and application of the law. 
They are written by domain experts who describe in a clear and concise manner the principle of law applied in the judgments.We introduce a legal holding extraction method based on Italian-LEGAL-BERT to automatically extract legal holdings from Italian cases. In addition, we present ITA-CaseHold, a benchmark dataset for Italian legal summarization. We conducted several experiments using this dataset, as a valuable baseline for future research on this topic.}, - booktitle = {Proceedings of the Nineteenth International Conference on Artificial Intelligence and Law}, - pages = {148–156}, - numpages = {9}, - keywords = {Italian-LEGAL-BERT, Holding Extraction, Extractive Text Summarization, Benchmark Dataset}, - location = {, Braga, Portugal, }, - series = {ICAIL '23} - } - """, + bibtex_citation=r""" +@inproceedings{10.1145/3594536.3595177, + abstract = {Legal holdings are used in Italy as a critical component of the legal system, serving to establish legal precedents, provide guidance for future legal decisions, and ensure consistency and predictability in the interpretation and application of the law. They are written by domain experts who describe in a clear and concise manner the principle of law applied in the judgments.We introduce a legal holding extraction method based on Italian-LEGAL-BERT to automatically extract legal holdings from Italian cases. In addition, we present ITA-CaseHold, a benchmark dataset for Italian legal summarization. 
We conducted several experiments using this dataset, as a valuable baseline for future research on this topic.}, + address = {New York, NY, USA}, + author = {Licari, Daniele and Bushipaka, Praveen and Marino, Gabriele and Comand\'{e}, Giovanni and Cucinotta, Tommaso}, + booktitle = {Proceedings of the Nineteenth International Conference on Artificial Intelligence and Law}, + doi = {10.1145/3594536.3595177}, + isbn = {9798400701979}, + keywords = {Italian-LEGAL-BERT, Holding Extraction, Extractive Text Summarization, Benchmark Dataset}, + location = {, Braga, Portugal, }, + numpages = {9}, + pages = {148–156}, + publisher = {Association for Computing Machinery}, + series = {ICAIL '23}, + title = {Legal Holding Extraction from Italian Case Documents using Italian-LEGAL-BERT Text Summarization}, + url = {https://doi.org/10.1145/3594536.3595177}, + year = {2023}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Classification/ita/ItalianLinguistAcceptabilityClassification.py b/mteb/tasks/Classification/ita/ItalianLinguistAcceptabilityClassification.py index 9509f4d9ed..db6f371494 100644 --- a/mteb/tasks/Classification/ita/ItalianLinguistAcceptabilityClassification.py +++ b/mteb/tasks/Classification/ita/ItalianLinguistAcceptabilityClassification.py @@ -27,23 +27,23 @@ class ItalianLinguisticAcceptabilityClassification(AbsTaskClassification): license="not specified", annotations_creators="expert-annotated", sample_creation="found", - bibtex_citation=""" - @inproceedings{trotta-etal-2021-monolingual-cross, - title = "Monolingual and Cross-Lingual Acceptability Judgments with the {I}talian {C}o{LA} corpus", - author = "Trotta, Daniela and - Guarasci, Raffaele and - Leonardelli, Elisa and - Tonelli, Sara", - booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2021", - month = nov, - year = "2021", - address = "Punta Cana, Dominican Republic", - publisher = "Association for Computational Linguistics", - url = 
"https://aclanthology.org/2021.findings-emnlp.250", - doi = "10.18653/v1/2021.findings-emnlp.250", - pages = "2929--2940" + bibtex_citation=r""" +@inproceedings{trotta-etal-2021-monolingual-cross, + address = {Punta Cana, Dominican Republic}, + author = {Trotta, Daniela and +Guarasci, Raffaele and +Leonardelli, Elisa and +Tonelli, Sara}, + booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2021}, + doi = {10.18653/v1/2021.findings-emnlp.250}, + month = nov, + pages = {2929--2940}, + publisher = {Association for Computational Linguistics}, + title = {Monolingual and Cross-Lingual Acceptability Judgments with the {I}talian {C}o{LA} corpus}, + url = {https://aclanthology.org/2021.findings-emnlp.250}, + year = {2021}, } - """, +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Classification/jav/JavaneseIMDBClassification.py b/mteb/tasks/Classification/jav/JavaneseIMDBClassification.py index bc79f0b851..b0fa0144bd 100644 --- a/mteb/tasks/Classification/jav/JavaneseIMDBClassification.py +++ b/mteb/tasks/Classification/jav/JavaneseIMDBClassification.py @@ -27,16 +27,16 @@ class JavaneseIMDBClassification(AbsTaskClassification): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @inproceedings{wongso2021causal, - title={Causal and Masked Language Modeling of Javanese Language using Transformer-based Architectures}, - author={Wongso, Wilson and Setiawan, David Samuel and Suhartono, Derwin}, - booktitle={2021 International Conference on Advanced Computer Science and Information Systems (ICACSIS)}, - pages={1--7}, - year={2021}, - organization={IEEE} - } - """, + bibtex_citation=r""" +@inproceedings{wongso2021causal, + author = {Wongso, Wilson and Setiawan, David Samuel and Suhartono, Derwin}, + booktitle = {2021 International Conference on Advanced Computer Science and Information Systems (ICACSIS)}, + organization = {IEEE}, + pages = {1--7}, + title = {Causal and Masked Language Modeling 
of Javanese Language using Transformer-based Architectures}, + year = {2021}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Classification/jpn/WRIMEClassification.py b/mteb/tasks/Classification/jpn/WRIMEClassification.py index 623a266177..893b092167 100644 --- a/mteb/tasks/Classification/jpn/WRIMEClassification.py +++ b/mteb/tasks/Classification/jpn/WRIMEClassification.py @@ -28,32 +28,34 @@ class WRIMEClassification(AbsTaskClassification): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{kajiwara-etal-2021-wrime, - title = "{WRIME}: A New Dataset for Emotional Intensity Estimation with Subjective and Objective Annotations", - author = "Kajiwara, Tomoyuki and - Chu, Chenhui and - Takemura, Noriko and - Nakashima, Yuta and - Nagahara, Hajime", - editor = "Toutanova, Kristina and - Rumshisky, Anna and - Zettlemoyer, Luke and - Hakkani-Tur, Dilek and - Beltagy, Iz and - Bethard, Steven and - Cotterell, Ryan and - Chakraborty, Tanmoy and - Zhou, Yichao", - booktitle = "Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies", - month = jun, - year = "2021", - address = "Online", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/2021.naacl-main.169", - doi = "10.18653/v1/2021.naacl-main.169", - pages = "2095--2104", - abstract = "We annotate 17,000 SNS posts with both the writer{'}s subjective emotional intensity and the reader{'}s objective one to construct a Japanese emotion analysis dataset. In this study, we explore the difference between the emotional intensity of the writer and that of the readers with this dataset. We found that the reader cannot fully detect the emotions of the writer, especially anger and trust. 
In addition, experimental results in estimating the emotional intensity show that it is more difficult to estimate the writer{'}s subjective labels than the readers{'}. The large gap between the subjective and objective emotions imply the complexity of the mapping from a post to the subjective emotion intensities, which also leads to a lower performance with machine learning models.", -}""", + bibtex_citation=r""" +@inproceedings{kajiwara-etal-2021-wrime, + abstract = {We annotate 17,000 SNS posts with both the writer{'}s subjective emotional intensity and the reader{'}s objective one to construct a Japanese emotion analysis dataset. In this study, we explore the difference between the emotional intensity of the writer and that of the readers with this dataset. We found that the reader cannot fully detect the emotions of the writer, especially anger and trust. In addition, experimental results in estimating the emotional intensity show that it is more difficult to estimate the writer{'}s subjective labels than the readers{'}. 
The large gap between the subjective and objective emotions imply the complexity of the mapping from a post to the subjective emotion intensities, which also leads to a lower performance with machine learning models.}, + address = {Online}, + author = {Kajiwara, Tomoyuki and +Chu, Chenhui and +Takemura, Noriko and +Nakashima, Yuta and +Nagahara, Hajime}, + booktitle = {Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies}, + doi = {10.18653/v1/2021.naacl-main.169}, + editor = {Toutanova, Kristina and +Rumshisky, Anna and +Zettlemoyer, Luke and +Hakkani-Tur, Dilek and +Beltagy, Iz and +Bethard, Steven and +Cotterell, Ryan and +Chakraborty, Tanmoy and +Zhou, Yichao}, + month = jun, + pages = {2095--2104}, + publisher = {Association for Computational Linguistics}, + title = {{WRIME}: A New Dataset for Emotional Intensity Estimation with Subjective and Objective Annotations}, + url = {https://aclanthology.org/2021.naacl-main.169}, + year = {2021}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Classification/kan/KannadaNewsClassification.py b/mteb/tasks/Classification/kan/KannadaNewsClassification.py index f005e56518..4d3edcc6ca 100644 --- a/mteb/tasks/Classification/kan/KannadaNewsClassification.py +++ b/mteb/tasks/Classification/kan/KannadaNewsClassification.py @@ -26,13 +26,14 @@ class KannadaNewsClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation=""" - @article{kunchukuttan2020indicnlpcorpus, - title={AI4Bharat-IndicNLP Corpus: Monolingual Corpora and Word Embeddings for Indic Languages}, - author={Anoop Kunchukuttan and Divyanshu Kakwani and Satish Golla and Gokul N.C. and Avik Bhattacharyya and Mitesh M. 
Khapra and Pratyush Kumar}, - year={2020}, - journal={arXiv preprint arXiv:2005.00085}, -}""", + bibtex_citation=r""" +@article{kunchukuttan2020indicnlpcorpus, + author = {Anoop Kunchukuttan and Divyanshu Kakwani and Satish Golla and Gokul N.C. and Avik Bhattacharyya and Mitesh M. Khapra and Pratyush Kumar}, + journal = {arXiv preprint arXiv:2005.00085}, + title = {AI4Bharat-IndicNLP Corpus: Monolingual Corpora and Word Embeddings for Indic Languages}, + year = {2020}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Classification/kat/GeorgianSentimentClassification.py b/mteb/tasks/Classification/kat/GeorgianSentimentClassification.py index e34d148a36..aa7458c97b 100644 --- a/mteb/tasks/Classification/kat/GeorgianSentimentClassification.py +++ b/mteb/tasks/Classification/kat/GeorgianSentimentClassification.py @@ -26,33 +26,33 @@ class GeorgianSentimentClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation=""" - @inproceedings{stefanovitch-etal-2022-resources, - title = "Resources and Experiments on Sentiment Classification for {G}eorgian", - author = "Stefanovitch, Nicolas and - Piskorski, Jakub and - Kharazi, Sopho", - editor = "Calzolari, Nicoletta and - B{\'e}chet, Fr{\'e}d{\'e}ric and - Blache, Philippe and - Choukri, Khalid and - Cieri, Christopher and - Declerck, Thierry and - Goggi, Sara and - Isahara, Hitoshi and - Maegaard, Bente and - Mariani, Joseph and - Mazo, H{\'e}l{\`e}ne and - Odijk, Jan and - Piperidis, Stelios", - booktitle = "Proceedings of the Thirteenth Language Resources and Evaluation Conference", - month = jun, - year = "2022", - address = "Marseille, France", - publisher = "European Language Resources Association", - url = "https://aclanthology.org/2022.lrec-1.173", - pages = "1613--1621", - abstract = "This paper presents, to the best of our knowledge, the first ever publicly available annotated dataset for sentiment classification and semantic polarity 
dictionary for Georgian. The characteristics of these resources and the process of their creation are described in detail. The results of various experiments on the performance of both lexicon- and machine learning-based models for Georgian sentiment classification are also reported. Both 3-label (positive, neutral, negative) and 4-label settings (same labels + mixed) are considered. The machine learning models explored include, i.a., logistic regression, SVMs, and transformed-based models. We also explore transfer learning- and translation-based (to a well-supported language) approaches. The obtained results for Georgian are on par with the state-of-the-art results in sentiment classification for well studied languages when using training data of comparable size.", + bibtex_citation=r""" +@inproceedings{stefanovitch-etal-2022-resources, + abstract = {This paper presents, to the best of our knowledge, the first ever publicly available annotated dataset for sentiment classification and semantic polarity dictionary for Georgian. The characteristics of these resources and the process of their creation are described in detail. The results of various experiments on the performance of both lexicon- and machine learning-based models for Georgian sentiment classification are also reported. Both 3-label (positive, neutral, negative) and 4-label settings (same labels + mixed) are considered. The machine learning models explored include, i.a., logistic regression, SVMs, and transformed-based models. We also explore transfer learning- and translation-based (to a well-supported language) approaches. 
The obtained results for Georgian are on par with the state-of-the-art results in sentiment classification for well studied languages when using training data of comparable size.}, + address = {Marseille, France}, + author = {Stefanovitch, Nicolas and +Piskorski, Jakub and +Kharazi, Sopho}, + booktitle = {Proceedings of the Thirteenth Language Resources and Evaluation Conference}, + editor = {Calzolari, Nicoletta and +B{\'e}chet, Fr{\'e}d{\'e}ric and +Blache, Philippe and +Choukri, Khalid and +Cieri, Christopher and +Declerck, Thierry and +Goggi, Sara and +Isahara, Hitoshi and +Maegaard, Bente and +Mariani, Joseph and +Mazo, H{\'e}l{\`e}ne and +Odijk, Jan and +Piperidis, Stelios}, + month = jun, + pages = {1613--1621}, + publisher = {European Language Resources Association}, + title = {Resources and Experiments on Sentiment Classification for {G}eorgian}, + url = {https://aclanthology.org/2022.lrec-1.173}, + year = {2022}, } - """, +""", ) diff --git a/mteb/tasks/Classification/kor/KlueTC.py b/mteb/tasks/Classification/kor/KlueTC.py index 55a2d760d9..bf878570ac 100644 --- a/mteb/tasks/Classification/kor/KlueTC.py +++ b/mteb/tasks/Classification/kor/KlueTC.py @@ -27,14 +27,16 @@ class KlueTC(AbsTaskClassification): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@misc{park2021klue, - title={KLUE: Korean Language Understanding Evaluation}, - author={Sungjoon Park and Jihyung Moon and Sungdong Kim and Won Ik Cho and Jiyoon Han and Jangwon Park and Chisung Song and Junseong Kim and Yongsook Song and Taehwan Oh and Joohong Lee and Juhyun Oh and Sungwon Lyu and Younghoon Jeong and Inkwon Lee and Sangwoo Seo and Dongjun Lee and Hyunwoo Kim and Myeonghwa Lee and Seongbo Jang and Seungwon Do and Sunkyoung Kim and Kyungtae Lim and Jongwon Lee and Kyumin Park and Jamin Shin and Seonghyun Kim and Lucy Park and Alice Oh and Jungwoo Ha and Kyunghyun Cho}, - year={2021}, - eprint={2105.09680}, - archivePrefix={arXiv}, - 
primaryClass={cs.CL} -}""", + bibtex_citation=r""" +@misc{park2021klue, + archiveprefix = {arXiv}, + author = {Sungjoon Park and Jihyung Moon and Sungdong Kim and Won Ik Cho and Jiyoon Han and Jangwon Park and Chisung Song and Junseong Kim and Yongsook Song and Taehwan Oh and Joohong Lee and Juhyun Oh and Sungwon Lyu and Younghoon Jeong and Inkwon Lee and Sangwoo Seo and Dongjun Lee and Hyunwoo Kim and Myeonghwa Lee and Seongbo Jang and Seungwon Do and Sunkyoung Kim and Kyungtae Lim and Jongwon Lee and Kyumin Park and Jamin Shin and Seonghyun Kim and Lucy Park and Alice Oh and Jungwoo Ha and Kyunghyun Cho}, + eprint = {2105.09680}, + primaryclass = {cs.CL}, + title = {KLUE: Korean Language Understanding Evaluation}, + year = {2021}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Classification/kor/KorFin.py b/mteb/tasks/Classification/kor/KorFin.py index 1fdfb47694..c59de86c92 100644 --- a/mteb/tasks/Classification/kor/KorFin.py +++ b/mteb/tasks/Classification/kor/KorFin.py @@ -31,14 +31,14 @@ class KorFin(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation="""" - @article{son2023removing, - title={Removing Non-Stationary Knowledge From Pre-Trained Language Models for Entity-Level Sentiment Classification in Finance}, - author={Son, Guijin and Lee, Hanwool and Kang, Nahyeon and Hahm, Moonjeong}, - journal={arXiv preprint arXiv:2301.03136}, - year={2023} - } - """, + bibtex_citation=r""" +@article{son2023removing, + author = {Son, Guijin and Lee, Hanwool and Kang, Nahyeon and Hahm, Moonjeong}, + journal = {arXiv preprint arXiv:2301.03136}, + title = {Removing Non-Stationary Knowledge From Pre-Trained Language Models for Entity-Level Sentiment Classification in Finance}, + year = {2023}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Classification/kor/KorHateClassification.py b/mteb/tasks/Classification/kor/KorHateClassification.py index 49cce26a25..a9ec38fdef 
100644 --- a/mteb/tasks/Classification/kor/KorHateClassification.py +++ b/mteb/tasks/Classification/kor/KorHateClassification.py @@ -34,15 +34,16 @@ class KorHateClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{moon2020beep, - title={BEEP! Korean Corpus of Online News Comments for Toxic Speech Detection}, - author={Jihyung Moon and Won Ik Cho and Junbum Lee}, - year={2020}, - eprint={2005.12503}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }""", + bibtex_citation=r""" +@misc{moon2020beep, + archiveprefix = {arXiv}, + author = {Jihyung Moon and Won Ik Cho and Junbum Lee}, + eprint = {2005.12503}, + primaryclass = {cs.CL}, + title = {BEEP! Korean Corpus of Online News Comments for Toxic Speech Detection}, + year = {2020}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Classification/kor/KorSarcasmClassification.py b/mteb/tasks/Classification/kor/KorSarcasmClassification.py index 666ae4ca54..abae7c8222 100644 --- a/mteb/tasks/Classification/kor/KorSarcasmClassification.py +++ b/mteb/tasks/Classification/kor/KorSarcasmClassification.py @@ -34,16 +34,16 @@ class KorSarcasmClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{kim2019kocasm, - author = {Kim, Jiwon and Cho, Won Ik}, - title = {Kocasm: Korean Automatic Sarcasm Detection}, - year = {2019}, - publisher = {GitHub}, - journal = {GitHub repository}, - howpublished = {https://github.com/SpellOnYou/korean-sarcasm} - } - """, + bibtex_citation=r""" +@misc{kim2019kocasm, + author = {Kim, Jiwon and Cho, Won Ik}, + howpublished = {https://github.com/SpellOnYou/korean-sarcasm}, + journal = {GitHub repository}, + publisher = {GitHub}, + title = {Kocasm: Korean Automatic Sarcasm Detection}, + year = {2019}, +} +""", ) def dataset_transform(self): diff --git 
a/mteb/tasks/Classification/kur/KurdishSentimentClassification.py b/mteb/tasks/Classification/kur/KurdishSentimentClassification.py index 2f9564caff..876b7450fd 100644 --- a/mteb/tasks/Classification/kur/KurdishSentimentClassification.py +++ b/mteb/tasks/Classification/kur/KurdishSentimentClassification.py @@ -26,15 +26,15 @@ class KurdishSentimentClassification(AbsTaskClassification): annotations_creators="derived", dialect=["Sorani"], sample_creation="found", - bibtex_citation=""" - @article{article, - author = {Badawi, Soran and Kazemi, Arefeh and Rezaie, Vali}, - year = {2024}, - month = {01}, - pages = {1-20}, - title = {KurdiSent: a corpus for kurdish sentiment analysis}, - journal = {Language Resources and Evaluation}, - doi = {10.1007/s10579-023-09716-6} - } - """, + bibtex_citation=r""" +@article{article, + author = {Badawi, Soran and Kazemi, Arefeh and Rezaie, Vali}, + doi = {10.1007/s10579-023-09716-6}, + journal = {Language Resources and Evaluation}, + month = {01}, + pages = {1-20}, + title = {KurdiSent: a corpus for kurdish sentiment analysis}, + year = {2024}, +} +""", ) diff --git a/mteb/tasks/Classification/mal/MalayalamNewsClassification.py b/mteb/tasks/Classification/mal/MalayalamNewsClassification.py index e454700717..689e7688ac 100644 --- a/mteb/tasks/Classification/mal/MalayalamNewsClassification.py +++ b/mteb/tasks/Classification/mal/MalayalamNewsClassification.py @@ -26,12 +26,14 @@ class MalayalamNewsClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@article{kunchukuttan2020indicnlpcorpus, - title={AI4Bharat-IndicNLP Corpus: Monolingual Corpora and Word Embeddings for Indic Languages}, - author={Anoop Kunchukuttan and Divyanshu Kakwani and Satish Golla and Gokul N.C. and Avik Bhattacharyya and Mitesh M. 
Khapra and Pratyush Kumar}, - year={2020}, - journal={arXiv preprint arXiv:2005.00085}, -}""", + bibtex_citation=r""" +@article{kunchukuttan2020indicnlpcorpus, + author = {Anoop Kunchukuttan and Divyanshu Kakwani and Satish Golla and Gokul N.C. and Avik Bhattacharyya and Mitesh M. Khapra and Pratyush Kumar}, + journal = {arXiv preprint arXiv:2005.00085}, + title = {AI4Bharat-IndicNLP Corpus: Monolingual Corpora and Word Embeddings for Indic Languages}, + year = {2020}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Classification/mar/MarathiNewsClassification.py b/mteb/tasks/Classification/mar/MarathiNewsClassification.py index 7fa104c444..4f652e2373 100644 --- a/mteb/tasks/Classification/mar/MarathiNewsClassification.py +++ b/mteb/tasks/Classification/mar/MarathiNewsClassification.py @@ -26,12 +26,14 @@ class MarathiNewsClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@article{kunchukuttan2020indicnlpcorpus, - title={AI4Bharat-IndicNLP Corpus: Monolingual Corpora and Word Embeddings for Indic Languages}, - author={Anoop Kunchukuttan and Divyanshu Kakwani and Satish Golla and Gokul N.C. and Avik Bhattacharyya and Mitesh M. Khapra and Pratyush Kumar}, - year={2020}, - journal={arXiv preprint arXiv:2005.00085}, -}""", + bibtex_citation=r""" +@article{kunchukuttan2020indicnlpcorpus, + author = {Anoop Kunchukuttan and Divyanshu Kakwani and Satish Golla and Gokul N.C. and Avik Bhattacharyya and Mitesh M. 
Khapra and Pratyush Kumar}, + journal = {arXiv preprint arXiv:2005.00085}, + title = {AI4Bharat-IndicNLP Corpus: Monolingual Corpora and Word Embeddings for Indic Languages}, + year = {2020}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Classification/mkd/MacedonianTweetSentimentClassification.py b/mteb/tasks/Classification/mkd/MacedonianTweetSentimentClassification.py index 58a555c6b1..3eb6b2dc81 100644 --- a/mteb/tasks/Classification/mkd/MacedonianTweetSentimentClassification.py +++ b/mteb/tasks/Classification/mkd/MacedonianTweetSentimentClassification.py @@ -26,20 +26,22 @@ class MacedonianTweetSentimentClassification(AbsTaskClassification): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{jovanoski-etal-2015-sentiment, - title = "Sentiment Analysis in {T}witter for {M}acedonian", - author = "Jovanoski, Dame and - Pachovski, Veno and - Nakov, Preslav", - editor = "Mitkov, Ruslan and - Angelova, Galia and - Bontcheva, Kalina", - booktitle = "Proceedings of the International Conference Recent Advances in Natural Language Processing", - month = sep, - year = "2015", - address = "Hissar, Bulgaria", - publisher = "INCOMA Ltd. Shoumen, BULGARIA", - url = "https://aclanthology.org/R15-1034", - pages = "249--257", -}""", + bibtex_citation=r""" +@inproceedings{jovanoski-etal-2015-sentiment, + address = {Hissar, Bulgaria}, + author = {Jovanoski, Dame and +Pachovski, Veno and +Nakov, Preslav}, + booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing}, + editor = {Mitkov, Ruslan and +Angelova, Galia and +Bontcheva, Kalina}, + month = sep, + pages = {249--257}, + publisher = {INCOMA Ltd. 
Shoumen, BULGARIA}, + title = {Sentiment Analysis in {T}witter for {M}acedonian}, + url = {https://aclanthology.org/R15-1034}, + year = {2015}, +} +""", ) diff --git a/mteb/tasks/Classification/multilingual/AfriSentiClassification.py b/mteb/tasks/Classification/multilingual/AfriSentiClassification.py index 8a4a79d68b..ff3e6575c9 100644 --- a/mteb/tasks/Classification/multilingual/AfriSentiClassification.py +++ b/mteb/tasks/Classification/multilingual/AfriSentiClassification.py @@ -52,11 +52,13 @@ class AfriSentiClassification(MultilingualTask, AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{Muhammad2023AfriSentiAT, - title=AfriSenti: A Twitter Sentiment Analysis Benchmark for African Languages, - author=Shamsuddeen Hassan Muhammad and Idris Abdulmumin and Abinew Ali Ayele and Nedjma Ousidhoum and David Ifeoluwa Adelani and Seid Muhie Yimam and Ibrahim Sa'id Ahmad and Meriem Beloucif and Saif Mohammad and Sebastian Ruder and Oumaima Hourrane and Pavel Brazdil and Felermino D'ario M'ario Ant'onio Ali and Davis Davis and Salomey Osei and Bello Shehu Bello and Falalu Ibrahim and Tajuddeen Gwadabe and Samuel Rutunda and Tadesse Belay and Wendimu Baye Messelle and Hailu Beshada Balcha and Sisay Adugna Chala and Hagos Tesfahun Gebremichael and Bernard Opoku and Steven Arthur, - year=2023 - }""", + bibtex_citation=r""" +@inproceedings{Muhammad2023AfriSentiAT, + author = {Shamsuddeen Hassan Muhammad and Idris Abdulmumin and Abinew Ali Ayele and Nedjma Ousidhoum and David Ifeoluwa Adelani and Seid Muhie Yimam and Ibrahim Sa'id Ahmad and Meriem Beloucif and Saif Mohammad and Sebastian Ruder and Oumaima Hourrane and Pavel Brazdil and Felermino D'ario M'ario Ant'onio Ali and Davis Davis and Salomey Osei and Bello Shehu Bello and Falalu Ibrahim and Tajuddeen Gwadabe and Samuel Rutunda and Tadesse Belay and Wendimu Baye Messelle and Hailu Beshada Balcha and Sisay Adugna Chala and Hagos Tesfahun 
Gebremichael and Bernard Opoku and Steven Arthur}, + title = {AfriSenti: A Twitter Sentiment Analysis Benchmark for African Languages}, + year = {2023}, +} +""", ) def load_data(self, **kwargs): diff --git a/mteb/tasks/Classification/multilingual/AmazonCounterfactualClassification.py b/mteb/tasks/Classification/multilingual/AmazonCounterfactualClassification.py index 112d4e0b27..165a5ea54d 100644 --- a/mteb/tasks/Classification/multilingual/AmazonCounterfactualClassification.py +++ b/mteb/tasks/Classification/multilingual/AmazonCounterfactualClassification.py @@ -38,27 +38,29 @@ class AmazonCounterfactualClassification(MultilingualTask, AbsTaskClassification annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{oneill-etal-2021-wish, - title = "{I} Wish {I} Would Have Loved This One, But {I} Didn{'}t {--} A Multilingual Dataset for Counterfactual Detection in Product Review", - author = "O{'}Neill, James and - Rozenshtein, Polina and - Kiryo, Ryuichi and - Kubota, Motoko and - Bollegala, Danushka", - editor = "Moens, Marie-Francine and - Huang, Xuanjing and - Specia, Lucia and - Yih, Scott Wen-tau", - booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing", - month = nov, - year = "2021", - address = "Online and Punta Cana, Dominican Republic", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/2021.emnlp-main.568", - doi = "10.18653/v1/2021.emnlp-main.568", - pages = "7092--7108", - abstract = "Counterfactual statements describe events that did not or cannot take place. We consider the problem of counterfactual detection (CFD) in product reviews. For this purpose, we annotate a multilingual CFD dataset from Amazon product reviews covering counterfactual statements written in English, German, and Japanese languages. 
The dataset is unique as it contains counterfactuals in multiple languages, covers a new application area of e-commerce reviews, and provides high quality professional annotations. We train CFD models using different text representation methods and classifiers. We find that these models are robust against the selectional biases introduced due to cue phrase-based sentence selection. Moreover, our CFD dataset is compatible with prior datasets and can be merged to learn accurate CFD models. Applying machine translation on English counterfactual examples to create multilingual data performs poorly, demonstrating the language-specificity of this problem, which has been ignored so far.", -}""", + bibtex_citation=r""" +@inproceedings{oneill-etal-2021-wish, + abstract = {Counterfactual statements describe events that did not or cannot take place. We consider the problem of counterfactual detection (CFD) in product reviews. For this purpose, we annotate a multilingual CFD dataset from Amazon product reviews covering counterfactual statements written in English, German, and Japanese languages. The dataset is unique as it contains counterfactuals in multiple languages, covers a new application area of e-commerce reviews, and provides high quality professional annotations. We train CFD models using different text representation methods and classifiers. We find that these models are robust against the selectional biases introduced due to cue phrase-based sentence selection. Moreover, our CFD dataset is compatible with prior datasets and can be merged to learn accurate CFD models. 
Applying machine translation on English counterfactual examples to create multilingual data performs poorly, demonstrating the language-specificity of this problem, which has been ignored so far.}, + address = {Online and Punta Cana, Dominican Republic}, + author = {O{'}Neill, James and +Rozenshtein, Polina and +Kiryo, Ryuichi and +Kubota, Motoko and +Bollegala, Danushka}, + booktitle = {Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing}, + doi = {10.18653/v1/2021.emnlp-main.568}, + editor = {Moens, Marie-Francine and +Huang, Xuanjing and +Specia, Lucia and +Yih, Scott Wen-tau}, + month = nov, + pages = {7092--7108}, + publisher = {Association for Computational Linguistics}, + title = {{I} Wish {I} Would Have Loved This One, But {I} Didn{'}t {--} A Multilingual Dataset for Counterfactual Detection in Product Review}, + url = {https://aclanthology.org/2021.emnlp-main.568}, + year = {2021}, +} +""", prompt="Classify a given Amazon customer review text as either counterfactual or not-counterfactual", ) diff --git a/mteb/tasks/Classification/multilingual/AmazonReviewsClassification.py b/mteb/tasks/Classification/multilingual/AmazonReviewsClassification.py index ca9fab41e0..d2250701f1 100644 --- a/mteb/tasks/Classification/multilingual/AmazonReviewsClassification.py +++ b/mteb/tasks/Classification/multilingual/AmazonReviewsClassification.py @@ -35,13 +35,15 @@ class AmazonReviewsClassification(MultilingualTask, AbsTaskClassification): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@misc{keung2020multilingual, - title={The Multilingual Amazon Reviews Corpus}, - author={Phillip Keung and Yichao Lu and György Szarvas and Noah A. Smith}, - year={2020}, - eprint={2010.02573}, - archivePrefix={arXiv}, - primaryClass={cs.CL} -}""", + bibtex_citation=r""" +@misc{keung2020multilingual, + archiveprefix = {arXiv}, + author = {Phillip Keung and Yichao Lu and György Szarvas and Noah A. 
Smith}, + eprint = {2010.02573}, + primaryclass = {cs.CL}, + title = {The Multilingual Amazon Reviews Corpus}, + year = {2020}, +} +""", prompt="Classify the given Amazon review into its appropriate rating category", ) diff --git a/mteb/tasks/Classification/multilingual/CataloniaTweetClassification.py b/mteb/tasks/Classification/multilingual/CataloniaTweetClassification.py index c21fee9cfa..f65861e3fb 100644 --- a/mteb/tasks/Classification/multilingual/CataloniaTweetClassification.py +++ b/mteb/tasks/Classification/multilingual/CataloniaTweetClassification.py @@ -37,33 +37,35 @@ class CataloniaTweetClassification(MultilingualTask, AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="created", - bibtex_citation="""@inproceedings{zotova-etal-2020-multilingual, - title = "Multilingual Stance Detection in Tweets: The {C}atalonia Independence Corpus", - author = "Zotova, Elena and - Agerri, Rodrigo and - Nu{\~n}ez, Manuel and - Rigau, German", - editor = "Calzolari, Nicoletta and - B{\'e}chet, Fr{\'e}d{\'e}ric and - Blache, Philippe and - Choukri, Khalid and - Cieri, Christopher and - Declerck, Thierry and - Goggi, Sara and - Isahara, Hitoshi and - Maegaard, Bente and - Mariani, Joseph and - Mazo, H{\'e}l{\`e}ne and - Moreno, Asuncion and - Odijk, Jan and - Piperidis, Stelios", - booktitle = "Proceedings of the Twelfth Language Resources and Evaluation Conference", - month = may, - year = "2020", - publisher = "European Language Resources Association", - pages = "1368--1375", - ISBN = "979-10-95546-34-4", - }""", + bibtex_citation=r""" +@inproceedings{zotova-etal-2020-multilingual, + author = {Zotova, Elena and +Agerri, Rodrigo and +Nu{\~n}ez, Manuel and +Rigau, German}, + booktitle = {Proceedings of the Twelfth Language Resources and Evaluation Conference}, + editor = {Calzolari, Nicoletta and +B{\'e}chet, Fr{\'e}d{\'e}ric and +Blache, Philippe and +Choukri, Khalid and +Cieri, Christopher and +Declerck, Thierry and +Goggi, Sara 
and +Isahara, Hitoshi and +Maegaard, Bente and +Mariani, Joseph and +Mazo, H{\'e}l{\`e}ne and +Moreno, Asuncion and +Odijk, Jan and +Piperidis, Stelios}, + isbn = {979-10-95546-34-4}, + month = may, + pages = {1368--1375}, + publisher = {European Language Resources Association}, + title = {Multilingual Stance Detection in Tweets: The {C}atalonia Independence Corpus}, + year = {2020}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Classification/multilingual/CyrillicTurkicLangClassification.py b/mteb/tasks/Classification/multilingual/CyrillicTurkicLangClassification.py index 3c0d2ca2a2..6aebfe32c9 100644 --- a/mteb/tasks/Classification/multilingual/CyrillicTurkicLangClassification.py +++ b/mteb/tasks/Classification/multilingual/CyrillicTurkicLangClassification.py @@ -36,14 +36,14 @@ class CyrillicTurkicLangClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation=""" - @inproceedings{goldhahn2012building, - title={Building Large Monolingual Dictionaries at the Leipzig Corpora Collection: From 100 to 200 Languages}, - author={Goldhahn, Dirk and Eckart, Thomas and Quasthoff, Uwe}, - booktitle={Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC'12)}, - year={2012} - } - """, + bibtex_citation=r""" +@inproceedings{goldhahn2012building, + author = {Goldhahn, Dirk and Eckart, Thomas and Quasthoff, Uwe}, + booktitle = {Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC'12)}, + title = {Building Large Monolingual Dictionaries at the Leipzig Corpora Collection: From 100 to 200 Languages}, + year = {2012}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Classification/multilingual/HinDialectClassification.py b/mteb/tasks/Classification/multilingual/HinDialectClassification.py index c9d6b36669..08258c3387 100644 --- a/mteb/tasks/Classification/multilingual/HinDialectClassification.py +++ 
b/mteb/tasks/Classification/multilingual/HinDialectClassification.py @@ -50,15 +50,16 @@ class HinDialectClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{11234/1-4839, - title = {{HinDialect} 1.1: 26 Hindi-related languages and dialects of the Indic Continuum in North India}, - author = {Bafna, Niyati and {\v Z}abokrtsk{\'y}, Zden{\v e}k and Espa{\~n}a-Bonet, Cristina and van Genabith, Josef and Kumar, Lalit "Samyak Lalit" and Suman, Sharda and Shivay, Rahul}, - url = {http://hdl.handle.net/11234/1-4839}, - note = {{LINDAT}/{CLARIAH}-{CZ} digital library at the Institute of Formal and Applied Linguistics ({{\'U}FAL}), Faculty of Mathematics and Physics, Charles University}, - copyright = {Creative Commons - Attribution-{NonCommercial}-{ShareAlike} 4.0 International ({CC} {BY}-{NC}-{SA} 4.0)}, - year = {2022} } - """, + bibtex_citation=r""" +@misc{11234/1-4839, + author = {Bafna, Niyati and {\v Z}abokrtsk{\'y}, Zden{\v e}k and Espa{\~n}a-Bonet, Cristina and van Genabith, Josef and Kumar, Lalit "Samyak Lalit" and Suman, Sharda and Shivay, Rahul}, + copyright = {Creative Commons - Attribution-{NonCommercial}-{ShareAlike} 4.0 International ({CC} {BY}-{NC}-{SA} 4.0)}, + note = {{LINDAT}/{CLARIAH}-{CZ} digital library at the Institute of Formal and Applied Linguistics ({{\'U}FAL}), Faculty of Mathematics and Physics, Charles University}, + title = {{HinDialect} 1.1: 26 Hindi-related languages and dialects of the Indic Continuum in North India}, + url = {http://hdl.handle.net/11234/1-4839}, + year = {2022}, +} +""", ) def dataset_transform(self) -> None: diff --git a/mteb/tasks/Classification/multilingual/IndicLangClassification.py b/mteb/tasks/Classification/multilingual/IndicLangClassification.py index 47564cf501..51d029061b 100644 --- a/mteb/tasks/Classification/multilingual/IndicLangClassification.py +++ b/mteb/tasks/Classification/multilingual/IndicLangClassification.py 
@@ -84,23 +84,25 @@ class IndicLangClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="created", - bibtex_citation="""@inproceedings{madhani-etal-2023-bhasa, - title = "Bhasa-Abhijnaanam: Native-script and romanized Language Identification for 22 {I}ndic languages", - author = "Madhani, Yash and - Khapra, Mitesh M. and - Kunchukuttan, Anoop", - editor = "Rogers, Anna and - Boyd-Graber, Jordan and - Okazaki, Naoaki", - booktitle = "Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)", - month = jul, - year = "2023", - address = "Toronto, Canada", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/2023.acl-short.71", - doi = "10.18653/v1/2023.acl-short.71", - pages = "816--826" -}""", + bibtex_citation=r""" +@inproceedings{madhani-etal-2023-bhasa, + address = {Toronto, Canada}, + author = {Madhani, Yash and +Khapra, Mitesh M. and +Kunchukuttan, Anoop}, + booktitle = {Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)}, + doi = {10.18653/v1/2023.acl-short.71}, + editor = {Rogers, Anna and +Boyd-Graber, Jordan and +Okazaki, Naoaki}, + month = jul, + pages = {816--826}, + publisher = {Association for Computational Linguistics}, + title = {Bhasa-Abhijnaanam: Native-script and romanized Language Identification for 22 {I}ndic languages}, + url = {https://aclanthology.org/2023.acl-short.71}, + year = {2023}, +} +""", ) def load_data(self, **kwargs: Any) -> None: diff --git a/mteb/tasks/Classification/multilingual/IndicNLPNewsClassification.py b/mteb/tasks/Classification/multilingual/IndicNLPNewsClassification.py index 3995917696..24e3dc8a9b 100644 --- a/mteb/tasks/Classification/multilingual/IndicNLPNewsClassification.py +++ b/mteb/tasks/Classification/multilingual/IndicNLPNewsClassification.py @@ -38,13 +38,14 @@ class 
IndicNLPNewsClassification(MultilingualTask, AbsTaskClassification): license="cc-by-nc-4.0", annotations_creators="expert-annotated", sample_creation="found", - bibtex_citation=""" - @article{kunchukuttan2020indicnlpcorpus, - title={AI4Bharat-IndicNLP Corpus: Monolingual Corpora and Word Embeddings for Indic Languages}, - author={Anoop Kunchukuttan and Divyanshu Kakwani and Satish Golla and Gokul N.C. and Avik Bhattacharyya and Mitesh M. Khapra and Pratyush Kumar}, - year={2020}, - journal={arXiv preprint arXiv:2005.00085} -}""", + bibtex_citation=r""" +@article{kunchukuttan2020indicnlpcorpus, + author = {Anoop Kunchukuttan and Divyanshu Kakwani and Satish Golla and Gokul N.C. and Avik Bhattacharyya and Mitesh M. Khapra and Pratyush Kumar}, + journal = {arXiv preprint arXiv:2005.00085}, + title = {AI4Bharat-IndicNLP Corpus: Monolingual Corpora and Word Embeddings for Indic Languages}, + year = {2020}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Classification/multilingual/IndicSentimentClassification.py b/mteb/tasks/Classification/multilingual/IndicSentimentClassification.py index 2687422935..1401bd8f87 100644 --- a/mteb/tasks/Classification/multilingual/IndicSentimentClassification.py +++ b/mteb/tasks/Classification/multilingual/IndicSentimentClassification.py @@ -44,13 +44,15 @@ class IndicSentimentClassification(MultilingualTask, AbsTaskClassification): annotations_creators="human-annotated", dialect=[], sample_creation="machine-translated and verified", - bibtex_citation="""@article{doddapaneni2022towards, - title = {Towards Leaving No Indic Language Behind: Building Monolingual Corpora, Benchmark and Models for Indic Languages}, - author = {Sumanth Doddapaneni and Rahul Aralikatte and Gowtham Ramesh and Shreyansh Goyal and Mitesh M. 
Khapra and Anoop Kunchukuttan and Pratyush Kumar}, - journal = {Annual Meeting of the Association for Computational Linguistics}, - year = {2022}, - doi = {10.18653/v1/2023.acl-long.693} -}""", + bibtex_citation=r""" +@article{doddapaneni2022towards, + author = {Sumanth Doddapaneni and Rahul Aralikatte and Gowtham Ramesh and Shreyansh Goyal and Mitesh M. Khapra and Anoop Kunchukuttan and Pratyush Kumar}, + doi = {10.18653/v1/2023.acl-long.693}, + journal = {Annual Meeting of the Association for Computational Linguistics}, + title = {Towards Leaving No Indic Language Behind: Building Monolingual Corpora, Benchmark and Models for Indic Languages}, + year = {2022}, +} +""", ) def dataset_transform(self) -> None: diff --git a/mteb/tasks/Classification/multilingual/LanguageClassification.py b/mteb/tasks/Classification/multilingual/LanguageClassification.py index 9ebcfc7406..8398bddea0 100644 --- a/mteb/tasks/Classification/multilingual/LanguageClassification.py +++ b/mteb/tasks/Classification/multilingual/LanguageClassification.py @@ -49,21 +49,23 @@ class LanguageClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@InProceedings{conneau2018xnli, + bibtex_citation=r""" +@inproceedings{conneau2018xnli, author = {Conneau, Alexis - and Rinott, Ruty - and Lample, Guillaume - and Williams, Adina - and Bowman, Samuel R. - and Schwenk, Holger - and Stoyanov, Veselin}, - title = {XNLI: Evaluating Cross-lingual Sentence Representations}, +and Rinott, Ruty +and Lample, Guillaume +and Williams, Adina +and Bowman, Samuel R. 
+and Schwenk, Holger +and Stoyanov, Veselin}, booktitle = {Proceedings of the 2018 Conference on Empirical Methods - in Natural Language Processing}, - year = {2018}, - publisher = {Association for Computational Linguistics}, +in Natural Language Processing}, location = {Brussels, Belgium}, -}""", + publisher = {Association for Computational Linguistics}, + title = {XNLI: Evaluating Cross-lingual Sentence Representations}, + year = {2018}, +} +""", ) def dataset_transform(self) -> None: diff --git a/mteb/tasks/Classification/multilingual/MTOPDomainClassification.py b/mteb/tasks/Classification/multilingual/MTOPDomainClassification.py index eb8713fd6d..34684005c1 100644 --- a/mteb/tasks/Classification/multilingual/MTOPDomainClassification.py +++ b/mteb/tasks/Classification/multilingual/MTOPDomainClassification.py @@ -37,26 +37,27 @@ class MTOPDomainClassification(MultilingualTask, AbsTaskClassification): annotations_creators="human-annotated", dialect=[], sample_creation="created", - bibtex_citation="""@inproceedings{li-etal-2021-mtop, - title = "{MTOP}: A Comprehensive Multilingual Task-Oriented Semantic Parsing Benchmark", - author = "Li, Haoran and - Arora, Abhinav and - Chen, Shuohui and - Gupta, Anchit and - Gupta, Sonal and - Mehdad, Yashar", - editor = "Merlo, Paola and - Tiedemann, Jorg and - Tsarfaty, Reut", - booktitle = "Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume", - month = apr, - year = "2021", - address = "Online", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/2021.eacl-main.257", - doi = "10.18653/v1/2021.eacl-main.257", - pages = "2950--2962", - abstract = "Scaling semantic parsing models for task-oriented dialog systems to new languages is often expensive and time-consuming due to the lack of available datasets. 
Available datasets suffer from several shortcomings: a) they contain few languages b) they contain small amounts of labeled examples per language c) they are based on the simple intent and slot detection paradigm for non-compositional queries. In this paper, we present a new multilingual dataset, called MTOP, comprising of 100k annotated utterances in 6 languages across 11 domains. We use this dataset and other publicly available datasets to conduct a comprehensive benchmarking study on using various state-of-the-art multilingual pre-trained models for task-oriented semantic parsing. We achieve an average improvement of +6.3 points on Slot F1 for the two existing multilingual datasets, over best results reported in their experiments. Furthermore, we demonstrate strong zero-shot performance using pre-trained models combined with automatic translation and alignment, and a proposed distant supervision method to reduce the noise in slot label projection.", + bibtex_citation=r""" +@inproceedings{li-etal-2021-mtop, + abstract = {Scaling semantic parsing models for task-oriented dialog systems to new languages is often expensive and time-consuming due to the lack of available datasets. Available datasets suffer from several shortcomings: a) they contain few languages b) they contain small amounts of labeled examples per language c) they are based on the simple intent and slot detection paradigm for non-compositional queries. In this paper, we present a new multilingual dataset, called MTOP, comprising of 100k annotated utterances in 6 languages across 11 domains. We use this dataset and other publicly available datasets to conduct a comprehensive benchmarking study on using various state-of-the-art multilingual pre-trained models for task-oriented semantic parsing. We achieve an average improvement of +6.3 points on Slot F1 for the two existing multilingual datasets, over best results reported in their experiments. 
Furthermore, we demonstrate strong zero-shot performance using pre-trained models combined with automatic translation and alignment, and a proposed distant supervision method to reduce the noise in slot label projection.}, + address = {Online}, + author = {Li, Haoran and +Arora, Abhinav and +Chen, Shuohui and +Gupta, Anchit and +Gupta, Sonal and +Mehdad, Yashar}, + booktitle = {Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume}, + doi = {10.18653/v1/2021.eacl-main.257}, + editor = {Merlo, Paola and +Tiedemann, Jorg and +Tsarfaty, Reut}, + month = apr, + pages = {2950--2962}, + publisher = {Association for Computational Linguistics}, + title = {{MTOP}: A Comprehensive Multilingual Task-Oriented Semantic Parsing Benchmark}, + url = {https://aclanthology.org/2021.eacl-main.257}, + year = {2021}, } """, prompt="Classify the intent domain of the given utterance in task-oriented conversation", diff --git a/mteb/tasks/Classification/multilingual/MTOPIntentClassification.py b/mteb/tasks/Classification/multilingual/MTOPIntentClassification.py index 52863107b6..0450acfb36 100644 --- a/mteb/tasks/Classification/multilingual/MTOPIntentClassification.py +++ b/mteb/tasks/Classification/multilingual/MTOPIntentClassification.py @@ -37,26 +37,27 @@ class MTOPIntentClassification(MultilingualTask, AbsTaskClassification): annotations_creators="human-annotated", dialect=[], sample_creation="created", - bibtex_citation="""@inproceedings{li-etal-2021-mtop, - title = "{MTOP}: A Comprehensive Multilingual Task-Oriented Semantic Parsing Benchmark", - author = "Li, Haoran and - Arora, Abhinav and - Chen, Shuohui and - Gupta, Anchit and - Gupta, Sonal and - Mehdad, Yashar", - editor = "Merlo, Paola and - Tiedemann, Jorg and - Tsarfaty, Reut", - booktitle = "Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume", - month = apr, - year = "2021", - address = 
"Online", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/2021.eacl-main.257", - doi = "10.18653/v1/2021.eacl-main.257", - pages = "2950--2962", - abstract = "Scaling semantic parsing models for task-oriented dialog systems to new languages is often expensive and time-consuming due to the lack of available datasets. Available datasets suffer from several shortcomings: a) they contain few languages b) they contain small amounts of labeled examples per language c) they are based on the simple intent and slot detection paradigm for non-compositional queries. In this paper, we present a new multilingual dataset, called MTOP, comprising of 100k annotated utterances in 6 languages across 11 domains. We use this dataset and other publicly available datasets to conduct a comprehensive benchmarking study on using various state-of-the-art multilingual pre-trained models for task-oriented semantic parsing. We achieve an average improvement of +6.3 points on Slot F1 for the two existing multilingual datasets, over best results reported in their experiments. Furthermore, we demonstrate strong zero-shot performance using pre-trained models combined with automatic translation and alignment, and a proposed distant supervision method to reduce the noise in slot label projection.", + bibtex_citation=r""" +@inproceedings{li-etal-2021-mtop, + abstract = {Scaling semantic parsing models for task-oriented dialog systems to new languages is often expensive and time-consuming due to the lack of available datasets. Available datasets suffer from several shortcomings: a) they contain few languages b) they contain small amounts of labeled examples per language c) they are based on the simple intent and slot detection paradigm for non-compositional queries. In this paper, we present a new multilingual dataset, called MTOP, comprising of 100k annotated utterances in 6 languages across 11 domains. 
We use this dataset and other publicly available datasets to conduct a comprehensive benchmarking study on using various state-of-the-art multilingual pre-trained models for task-oriented semantic parsing. We achieve an average improvement of +6.3 points on Slot F1 for the two existing multilingual datasets, over best results reported in their experiments. Furthermore, we demonstrate strong zero-shot performance using pre-trained models combined with automatic translation and alignment, and a proposed distant supervision method to reduce the noise in slot label projection.}, + address = {Online}, + author = {Li, Haoran and +Arora, Abhinav and +Chen, Shuohui and +Gupta, Anchit and +Gupta, Sonal and +Mehdad, Yashar}, + booktitle = {Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume}, + doi = {10.18653/v1/2021.eacl-main.257}, + editor = {Merlo, Paola and +Tiedemann, Jorg and +Tsarfaty, Reut}, + month = apr, + pages = {2950--2962}, + publisher = {Association for Computational Linguistics}, + title = {{MTOP}: A Comprehensive Multilingual Task-Oriented Semantic Parsing Benchmark}, + url = {https://aclanthology.org/2021.eacl-main.257}, + year = {2021}, } """, prompt="Classify the intent of the given utterance in task-oriented conversation", diff --git a/mteb/tasks/Classification/multilingual/MasakhaNEWSClassification.py b/mteb/tasks/Classification/multilingual/MasakhaNEWSClassification.py index 66cfe71b17..4969c6794a 100644 --- a/mteb/tasks/Classification/multilingual/MasakhaNEWSClassification.py +++ b/mteb/tasks/Classification/multilingual/MasakhaNEWSClassification.py @@ -46,14 +46,16 @@ class MasakhaNEWSClassification(AbsTaskClassification, MultilingualTask): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@misc{adelani2023masakhanews, - title={MasakhaNEWS: News Topic Classification for African languages}, - author={David Ifeoluwa Adelani and Marek 
Masiak and Israel Abebe Azime and Jesujoba Alabi and Atnafu Lambebo Tonja and Christine Mwase and Odunayo Ogundepo and Bonaventure F. P. Dossou and Akintunde Oladipo and Doreen Nixdorf and Chris Chinenye Emezue and sana al-azzawi and Blessing Sibanda and Davis David and Lolwethu Ndolela and Jonathan Mukiibi and Tunde Ajayi and Tatiana Moteu and Brian Odhiambo and Abraham Owodunni and Nnaemeka Obiefuna and Muhidin Mohamed and Shamsuddeen Hassan Muhammad and Teshome Mulugeta Ababu and Saheed Abdullahi Salahudeen and Mesay Gemeda Yigezu and Tajuddeen Gwadabe and Idris Abdulmumin and Mahlet Taye and Oluwabusayo Awoyomi and Iyanuoluwa Shode and Tolulope Adelani and Habiba Abdulganiyu and Abdul-Hakeem Omotayo and Adetola Adeeko and Abeeb Afolabi and Anuoluwapo Aremu and Olanrewaju Samuel and Clemencia Siro and Wangari Kimotho and Onyekachi Ogbu and Chinedu Mbonu and Chiamaka Chukwuneke and Samuel Fanijo and Jessica Ojo and Oyinkansola Awosan and Tadesse Kebede and Toadoum Sari Sakayo and Pamela Nyatsine and Freedmore Sidume and Oreen Yousuf and Mardiyyah Oduwole and Tshinu Tshinu and Ussen Kimanuka and Thina Diko and Siyanda Nxakama and Sinodos Nigusse and Abdulmejid Johar and Shafie Mohamed and Fuad Mire Hassan and Moges Ahmed Mehamed and Evrard Ngabire and Jules Jules and Ivan Ssenkungu and Pontus Stenetorp}, - year={2023}, - eprint={2304.09972}, - archivePrefix={arXiv}, - primaryClass={cs.CL} -}""", + bibtex_citation=r""" +@misc{adelani2023masakhanews, + archiveprefix = {arXiv}, + author = {David Ifeoluwa Adelani and Marek Masiak and Israel Abebe Azime and Jesujoba Alabi and Atnafu Lambebo Tonja and Christine Mwase and Odunayo Ogundepo and Bonaventure F. P. 
Dossou and Akintunde Oladipo and Doreen Nixdorf and Chris Chinenye Emezue and sana al-azzawi and Blessing Sibanda and Davis David and Lolwethu Ndolela and Jonathan Mukiibi and Tunde Ajayi and Tatiana Moteu and Brian Odhiambo and Abraham Owodunni and Nnaemeka Obiefuna and Muhidin Mohamed and Shamsuddeen Hassan Muhammad and Teshome Mulugeta Ababu and Saheed Abdullahi Salahudeen and Mesay Gemeda Yigezu and Tajuddeen Gwadabe and Idris Abdulmumin and Mahlet Taye and Oluwabusayo Awoyomi and Iyanuoluwa Shode and Tolulope Adelani and Habiba Abdulganiyu and Abdul-Hakeem Omotayo and Adetola Adeeko and Abeeb Afolabi and Anuoluwapo Aremu and Olanrewaju Samuel and Clemencia Siro and Wangari Kimotho and Onyekachi Ogbu and Chinedu Mbonu and Chiamaka Chukwuneke and Samuel Fanijo and Jessica Ojo and Oyinkansola Awosan and Tadesse Kebede and Toadoum Sari Sakayo and Pamela Nyatsine and Freedmore Sidume and Oreen Yousuf and Mardiyyah Oduwole and Tshinu Tshinu and Ussen Kimanuka and Thina Diko and Siyanda Nxakama and Sinodos Nigusse and Abdulmejid Johar and Shafie Mohamed and Fuad Mire Hassan and Moges Ahmed Mehamed and Evrard Ngabire and Jules Jules and Ivan Ssenkungu and Pontus Stenetorp}, + eprint = {2304.09972}, + primaryclass = {cs.CL}, + title = {MasakhaNEWS: News Topic Classification for African languages}, + year = {2023}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Classification/multilingual/MassiveIntentClassification.py b/mteb/tasks/Classification/multilingual/MassiveIntentClassification.py index 4538c3d4f7..b68eed3785 100644 --- a/mteb/tasks/Classification/multilingual/MassiveIntentClassification.py +++ b/mteb/tasks/Classification/multilingual/MassiveIntentClassification.py @@ -82,13 +82,15 @@ class MassiveIntentClassification(MultilingualTask, AbsTaskClassification): annotations_creators="human-annotated", dialect=[], sample_creation="human-translated and localized", # with the exception of the English data - 
bibtex_citation="""@misc{fitzgerald2022massive, - title={MASSIVE: A 1M-Example Multilingual Natural Language Understanding Dataset with 51 Typologically-Diverse Languages}, - author={Jack FitzGerald and Christopher Hench and Charith Peris and Scott Mackie and Kay Rottmann and Ana Sanchez and Aaron Nash and Liam Urbach and Vishesh Kakarala and Richa Singh and Swetha Ranganath and Laurie Crist and Misha Britan and Wouter Leeuwis and Gokhan Tur and Prem Natarajan}, - year={2022}, - eprint={2204.08582}, - archivePrefix={arXiv}, - primaryClass={cs.CL} -}""", + bibtex_citation=r""" +@misc{fitzgerald2022massive, + archiveprefix = {arXiv}, + author = {Jack FitzGerald and Christopher Hench and Charith Peris and Scott Mackie and Kay Rottmann and Ana Sanchez and Aaron Nash and Liam Urbach and Vishesh Kakarala and Richa Singh and Swetha Ranganath and Laurie Crist and Misha Britan and Wouter Leeuwis and Gokhan Tur and Prem Natarajan}, + eprint = {2204.08582}, + primaryclass = {cs.CL}, + title = {MASSIVE: A 1M-Example Multilingual Natural Language Understanding Dataset with 51 Typologically-Diverse Languages}, + year = {2022}, +} +""", prompt="Given a user utterance as query, find the user intents", ) diff --git a/mteb/tasks/Classification/multilingual/MassiveScenarioClassification.py b/mteb/tasks/Classification/multilingual/MassiveScenarioClassification.py index d48394318c..3ddc95dced 100644 --- a/mteb/tasks/Classification/multilingual/MassiveScenarioClassification.py +++ b/mteb/tasks/Classification/multilingual/MassiveScenarioClassification.py @@ -82,13 +82,15 @@ class MassiveScenarioClassification(MultilingualTask, AbsTaskClassification): annotations_creators="human-annotated", dialect=[], sample_creation="human-translated and localized", # with the exception of the English data - bibtex_citation="""@misc{fitzgerald2022massive, - title={MASSIVE: A 1M-Example Multilingual Natural Language Understanding Dataset with 51 Typologically-Diverse Languages}, - author={Jack FitzGerald 
and Christopher Hench and Charith Peris and Scott Mackie and Kay Rottmann and Ana Sanchez and Aaron Nash and Liam Urbach and Vishesh Kakarala and Richa Singh and Swetha Ranganath and Laurie Crist and Misha Britan and Wouter Leeuwis and Gokhan Tur and Prem Natarajan}, - year={2022}, - eprint={2204.08582}, - archivePrefix={arXiv}, - primaryClass={cs.CL} -}""", + bibtex_citation=r""" +@misc{fitzgerald2022massive, + archiveprefix = {arXiv}, + author = {Jack FitzGerald and Christopher Hench and Charith Peris and Scott Mackie and Kay Rottmann and Ana Sanchez and Aaron Nash and Liam Urbach and Vishesh Kakarala and Richa Singh and Swetha Ranganath and Laurie Crist and Misha Britan and Wouter Leeuwis and Gokhan Tur and Prem Natarajan}, + eprint = {2204.08582}, + primaryclass = {cs.CL}, + title = {MASSIVE: A 1M-Example Multilingual Natural Language Understanding Dataset with 51 Typologically-Diverse Languages}, + year = {2022}, +} +""", prompt="Given a user utterance as query, find the user scenarios", ) diff --git a/mteb/tasks/Classification/multilingual/MultiHateClassification.py b/mteb/tasks/Classification/multilingual/MultiHateClassification.py index f20ba592c1..ab6c35db09 100644 --- a/mteb/tasks/Classification/multilingual/MultiHateClassification.py +++ b/mteb/tasks/Classification/multilingual/MultiHateClassification.py @@ -45,53 +45,53 @@ class MultiHateClassification(MultilingualTask, AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="created", - bibtex_citation=""" - @inproceedings{rottger-etal-2021-hatecheck, - title = "{H}ate{C}heck: Functional Tests for Hate Speech Detection Models", - author = {R{\"o}ttger, Paul and - Vidgen, Bertie and - Nguyen, Dong and - Waseem, Zeerak and - Margetts, Helen and - Pierrehumbert, Janet}, - editor = "Zong, Chengqing and - Xia, Fei and - Li, Wenjie and - Navigli, Roberto", - booktitle = "Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th 
International Joint Conference on Natural Language Processing (Volume 1: Long Papers)", - month = aug, - year = "2021", - address = "Online", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/2021.acl-long.4", - doi = "10.18653/v1/2021.acl-long.4", - pages = "41--58", - abstract = "Detecting online hate is a difficult task that even state-of-the-art models struggle with. Typically, hate speech detection models are evaluated by measuring their performance on held-out test data using metrics such as accuracy and F1 score. However, this approach makes it difficult to identify specific model weak points. It also risks overestimating generalisable model performance due to increasingly well-evidenced systematic gaps and biases in hate speech datasets. To enable more targeted diagnostic insights, we introduce HateCheck, a suite of functional tests for hate speech detection models. We specify 29 model functionalities motivated by a review of previous research and a series of interviews with civil society stakeholders. We craft test cases for each functionality and validate their quality through a structured annotation process. To illustrate HateCheck{'}s utility, we test near-state-of-the-art transformer models as well as two popular commercial models, revealing critical model weaknesses.", - } + bibtex_citation=r""" +@inproceedings{rottger-etal-2021-hatecheck, + abstract = {Detecting online hate is a difficult task that even state-of-the-art models struggle with. Typically, hate speech detection models are evaluated by measuring their performance on held-out test data using metrics such as accuracy and F1 score. However, this approach makes it difficult to identify specific model weak points. It also risks overestimating generalisable model performance due to increasingly well-evidenced systematic gaps and biases in hate speech datasets. 
To enable more targeted diagnostic insights, we introduce HateCheck, a suite of functional tests for hate speech detection models. We specify 29 model functionalities motivated by a review of previous research and a series of interviews with civil society stakeholders. We craft test cases for each functionality and validate their quality through a structured annotation process. To illustrate HateCheck{'}s utility, we test near-state-of-the-art transformer models as well as two popular commercial models, revealing critical model weaknesses.}, + address = {Online}, + author = {R{\"o}ttger, Paul and +Vidgen, Bertie and +Nguyen, Dong and +Waseem, Zeerak and +Margetts, Helen and +Pierrehumbert, Janet}, + booktitle = {Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)}, + doi = {10.18653/v1/2021.acl-long.4}, + editor = {Zong, Chengqing and +Xia, Fei and +Li, Wenjie and +Navigli, Roberto}, + month = aug, + pages = {41--58}, + publisher = {Association for Computational Linguistics}, + title = {{H}ate{C}heck: Functional Tests for Hate Speech Detection Models}, + url = {https://aclanthology.org/2021.acl-long.4}, + year = {2021}, +} - @inproceedings{rottger-etal-2022-multilingual, - title = "Multilingual {H}ate{C}heck: Functional Tests for Multilingual Hate Speech Detection Models", - author = {R{\"o}ttger, Paul and - Seelawi, Haitham and - Nozza, Debora and - Talat, Zeerak and - Vidgen, Bertie}, - editor = "Narang, Kanika and - Mostafazadeh Davani, Aida and - Mathias, Lambert and - Vidgen, Bertie and - Talat, Zeerak", - booktitle = "Proceedings of the Sixth Workshop on Online Abuse and Harms (WOAH)", - month = jul, - year = "2022", - address = "Seattle, Washington (Hybrid)", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/2022.woah-1.15", - doi = "10.18653/v1/2022.woah-1.15", - pages = "154--169", 
- abstract = "Hate speech detection models are typically evaluated on held-out test sets. However, this risks painting an incomplete and potentially misleading picture of model performance because of increasingly well-documented systematic gaps and biases in hate speech datasets. To enable more targeted diagnostic insights, recent research has thus introduced functional tests for hate speech detection models. However, these tests currently only exist for English-language content, which means that they cannot support the development of more effective models in other languages spoken by billions across the world. To help address this issue, we introduce Multilingual HateCheck (MHC), a suite of functional tests for multilingual hate speech detection models. MHC covers 34 functionalities across ten languages, which is more languages than any other hate speech dataset. To illustrate MHC{'}s utility, we train and test a high-performing multilingual hate speech detection model, and reveal critical model weaknesses for monolingual and cross-lingual applications.", - } - """, +@inproceedings{rottger-etal-2022-multilingual, + abstract = {Hate speech detection models are typically evaluated on held-out test sets. However, this risks painting an incomplete and potentially misleading picture of model performance because of increasingly well-documented systematic gaps and biases in hate speech datasets. To enable more targeted diagnostic insights, recent research has thus introduced functional tests for hate speech detection models. However, these tests currently only exist for English-language content, which means that they cannot support the development of more effective models in other languages spoken by billions across the world. To help address this issue, we introduce Multilingual HateCheck (MHC), a suite of functional tests for multilingual hate speech detection models. 
MHC covers 34 functionalities across ten languages, which is more languages than any other hate speech dataset. To illustrate MHC{'}s utility, we train and test a high-performing multilingual hate speech detection model, and reveal critical model weaknesses for monolingual and cross-lingual applications.}, + address = {Seattle, Washington (Hybrid)}, + author = {R{\"o}ttger, Paul and +Seelawi, Haitham and +Nozza, Debora and +Talat, Zeerak and +Vidgen, Bertie}, + booktitle = {Proceedings of the Sixth Workshop on Online Abuse and Harms (WOAH)}, + doi = {10.18653/v1/2022.woah-1.15}, + editor = {Narang, Kanika and +Mostafazadeh Davani, Aida and +Mathias, Lambert and +Vidgen, Bertie and +Talat, Zeerak}, + month = jul, + pages = {154--169}, + publisher = {Association for Computational Linguistics}, + title = {Multilingual {H}ate{C}heck: Functional Tests for Multilingual Hate Speech Detection Models}, + url = {https://aclanthology.org/2022.woah-1.15}, + year = {2022}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Classification/multilingual/MultilingualSentimentClassification.py b/mteb/tasks/Classification/multilingual/MultilingualSentimentClassification.py index 1108dd7cf8..b6529c87d5 100644 --- a/mteb/tasks/Classification/multilingual/MultilingualSentimentClassification.py +++ b/mteb/tasks/Classification/multilingual/MultilingualSentimentClassification.py @@ -64,31 +64,31 @@ class MultilingualSentimentClassification(AbsTaskClassification, MultilingualTas annotations_creators="derived", dialect=["ar-dz"], sample_creation="found", - bibtex_citation=""" - @inproceedings{mollanorozy-etal-2023-cross, - title = "Cross-lingual Transfer Learning with \{P\}ersian", - author = "Mollanorozy, Sepideh and - Tanti, Marc and - Nissim, Malvina", - editor = "Beinborn, Lisa and - Goswami, Koustava and - Murado{\\u{g}}lu, Saliha and - Sorokin, Alexey and - Kumar, Ritesh and - Shcherbakov, Andreas and - Ponti, Edoardo M. 
and - Cotterell, Ryan and - Vylomova, Ekaterina", - booktitle = "Proceedings of the 5th Workshop on Research in Computational Linguistic Typology and Multilingual NLP", - month = may, - year = "2023", - address = "Dubrovnik, Croatia", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/2023.sigtyp-1.9", - doi = "10.18653/v1/2023.sigtyp-1.9", - pages = "89--95", - } - """, + bibtex_citation=r""" +@inproceedings{mollanorozy-etal-2023-cross, + address = {Dubrovnik, Croatia}, + author = {Mollanorozy, Sepideh and +Tanti, Marc and +Nissim, Malvina}, + booktitle = {Proceedings of the 5th Workshop on Research in Computational Linguistic Typology and Multilingual NLP}, + doi = {10.18653/v1/2023.sigtyp-1.9}, + editor = {Beinborn, Lisa and +Goswami, Koustava and +Murado{\u{g}}lu, Saliha and +Sorokin, Alexey and +Kumar, Ritesh and +Shcherbakov, Andreas and +Ponti, Edoardo M. and +Cotterell, Ryan and +Vylomova, Ekaterina}, + month = may, + pages = {89--95}, + publisher = {Association for Computational Linguistics}, + title = {Cross-lingual Transfer Learning with \{P\}ersian}, + url = {https://aclanthology.org/2023.sigtyp-1.9}, + year = {2023}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Classification/multilingual/NaijaSenti.py b/mteb/tasks/Classification/multilingual/NaijaSenti.py index b31333236e..f9767baa71 100644 --- a/mteb/tasks/Classification/multilingual/NaijaSenti.py +++ b/mteb/tasks/Classification/multilingual/NaijaSenti.py @@ -37,29 +37,30 @@ class NaijaSenti(AbsTaskClassification, MultilingualTask): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @inproceedings{muhammad-etal-2022-naijasenti, - title = "{N}aija{S}enti: A {N}igerian {T}witter Sentiment Corpus for Multilingual Sentiment Analysis", - author = "Muhammad, Shamsuddeen Hassan and - Adelani, David Ifeoluwa and - Ruder, Sebastian and - Ahmad, Ibrahim Sa{'}id and - Abdulmumin, Idris and - Bello, 
Bello Shehu and - Choudhury, Monojit and - Emezue, Chris Chinenye and - Abdullahi, Saheed Salahudeen and - Aremu, Anuoluwapo and - Jorge, Al{\'\i}pio and - Brazdil, Pavel", - booktitle = "Proceedings of the Thirteenth Language Resources and Evaluation Conference", - month = jun, - year = "2022", - address = "Marseille, France", - publisher = "European Language Resources Association", - url = "https://aclanthology.org/2022.lrec-1.63", - pages = "590--602", - }""", + bibtex_citation=r""" +@inproceedings{muhammad-etal-2022-naijasenti, + address = {Marseille, France}, + author = {Muhammad, Shamsuddeen Hassan and +Adelani, David Ifeoluwa and +Ruder, Sebastian and +Ahmad, Ibrahim Sa{'}id and +Abdulmumin, Idris and +Bello, Bello Shehu and +Choudhury, Monojit and +Emezue, Chris Chinenye and +Abdullahi, Saheed Salahudeen and +Aremu, Anuoluwapo and +Jorge, Al{\'\i}pio and +Brazdil, Pavel}, + booktitle = {Proceedings of the Thirteenth Language Resources and Evaluation Conference}, + month = jun, + pages = {590--602}, + publisher = {European Language Resources Association}, + title = {{N}aija{S}enti: A {N}igerian {T}witter Sentiment Corpus for Multilingual Sentiment Analysis}, + url = {https://aclanthology.org/2022.lrec-1.63}, + year = {2022}, +} +""", ) def load_data(self, **kwargs: Any) -> None: diff --git a/mteb/tasks/Classification/multilingual/NordicLangClassification.py b/mteb/tasks/Classification/multilingual/NordicLangClassification.py index 2a89e44a23..7854eca4fb 100644 --- a/mteb/tasks/Classification/multilingual/NordicLangClassification.py +++ b/mteb/tasks/Classification/multilingual/NordicLangClassification.py @@ -35,24 +35,25 @@ class NordicLangClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{haas-derczynski-2021-discriminating, - title = "Discriminating Between Similar {N}ordic Languages", - author = "Haas, Ren{\'e} and - Derczynski, Leon", - editor = {Zampieri, Marcos 
and - Nakov, Preslav and - Ljube{\v{s}}i{\'c}, Nikola and - Tiedemann, J{\"o}rg and - Scherrer, Yves and - Jauhiainen, Tommi}, - booktitle = "Proceedings of the Eighth Workshop on NLP for Similar Languages, Varieties and Dialects", - month = apr, - year = "2021", - address = "Kiyv, Ukraine", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/2021.vardial-1.8", - pages = "67--75", - abstract = "Automatic language identification is a challenging problem. Discriminating between closely related languages is especially difficult. This paper presents a machine learning approach for automatic language identification for the Nordic languages, which often suffer miscategorisation by existing state-of-the-art tools. Concretely we will focus on discrimination between six Nordic languages: Danish, Swedish, Norwegian (Nynorsk), Norwegian (Bokm{\aa}l), Faroese and Icelandic.", + bibtex_citation=r""" +@inproceedings{haas-derczynski-2021-discriminating, + abstract = {Automatic language identification is a challenging problem. Discriminating between closely related languages is especially difficult. This paper presents a machine learning approach for automatic language identification for the Nordic languages, which often suffer miscategorisation by existing state-of-the-art tools. 
Concretely we will focus on discrimination between six Nordic languages: Danish, Swedish, Norwegian (Nynorsk), Norwegian (Bokm{\aa}l), Faroese and Icelandic.}, + address = {Kiyv, Ukraine}, + author = {Haas, Ren{\'e} and +Derczynski, Leon}, + booktitle = {Proceedings of the Eighth Workshop on NLP for Similar Languages, Varieties and Dialects}, + editor = {Zampieri, Marcos and +Nakov, Preslav and +Ljube{\v{s}}i{\'c}, Nikola and +Tiedemann, J{\"o}rg and +Scherrer, Yves and +Jauhiainen, Tommi}, + month = apr, + pages = {67--75}, + publisher = {Association for Computational Linguistics}, + title = {Discriminating Between Similar {N}ordic Languages}, + url = {https://aclanthology.org/2021.vardial-1.8}, + year = {2021}, } """, prompt="Classify texts based on language", diff --git a/mteb/tasks/Classification/multilingual/NusaParagraphEmotionClassification.py b/mteb/tasks/Classification/multilingual/NusaParagraphEmotionClassification.py index fca11b365c..857fb161df 100644 --- a/mteb/tasks/Classification/multilingual/NusaParagraphEmotionClassification.py +++ b/mteb/tasks/Classification/multilingual/NusaParagraphEmotionClassification.py @@ -40,18 +40,18 @@ class NusaParagraphEmotionClassification(MultilingualTask, AbsTaskClassification annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @inproceedings{cahyawijaya-etal-2023-nusawrites, - title = "NusaWrites: Constructing High-Quality Corpora for Underrepresented and Extremely Low-Resource Languages", - author = "Cahyawijaya, Samuel and Lovenia, Holy and Koto, Fajri and Adhista, Dea and Dave, Emmanuel and Oktavianti, Sarah and Akbar, Salsabil and Lee, Jhonson and Shadieq, Nuur and Cenggoro, Tjeng Wawan and Linuwih, Hanung and Wilie, Bryan and Muridan, Galih and Winata, Genta and Moeljadi, David and Aji, Alham Fikri and Purwarianti, Ayu and Fung, Pascale", - editor = "Park, Jong C. 
and Arase, Yuki and Hu, Baotian and Lu, Wei and Wijaya, Derry and Purwarianti, Ayu and Krisnadhi, Adila Alfa", - booktitle = "Proceedings of the 13th International Joint Conference on Natural Language Processing and the 3rd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)", - month = nov, - year = "2023", - address = "Nusa Dua, Bali", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/2023.ijcnlp-main.60", - pages = "921--945", - } - """, + bibtex_citation=r""" +@inproceedings{cahyawijaya-etal-2023-nusawrites, + address = {Nusa Dua, Bali}, + author = {Cahyawijaya, Samuel and Lovenia, Holy and Koto, Fajri and Adhista, Dea and Dave, Emmanuel and Oktavianti, Sarah and Akbar, Salsabil and Lee, Jhonson and Shadieq, Nuur and Cenggoro, Tjeng Wawan and Linuwih, Hanung and Wilie, Bryan and Muridan, Galih and Winata, Genta and Moeljadi, David and Aji, Alham Fikri and Purwarianti, Ayu and Fung, Pascale}, + booktitle = {Proceedings of the 13th International Joint Conference on Natural Language Processing and the 3rd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)}, + editor = {Park, Jong C. 
and Arase, Yuki and Hu, Baotian and Lu, Wei and Wijaya, Derry and Purwarianti, Ayu and Krisnadhi, Adila Alfa}, + month = nov, + pages = {921--945}, + publisher = {Association for Computational Linguistics}, + title = {NusaWrites: Constructing High-Quality Corpora for Underrepresented and Extremely Low-Resource Languages}, + url = {https://aclanthology.org/2023.ijcnlp-main.60}, + year = {2023}, +} +""", ) diff --git a/mteb/tasks/Classification/multilingual/NusaParagraphTopicClassification.py b/mteb/tasks/Classification/multilingual/NusaParagraphTopicClassification.py index effd257709..cddeced561 100644 --- a/mteb/tasks/Classification/multilingual/NusaParagraphTopicClassification.py +++ b/mteb/tasks/Classification/multilingual/NusaParagraphTopicClassification.py @@ -40,18 +40,18 @@ class NusaParagraphTopicClassification(MultilingualTask, AbsTaskClassification): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @inproceedings{cahyawijaya-etal-2023-nusawrites, - title = "NusaWrites: Constructing High-Quality Corpora for Underrepresented and Extremely Low-Resource Languages", - author = "Cahyawijaya, Samuel and Lovenia, Holy and Koto, Fajri and Adhista, Dea and Dave, Emmanuel and Oktavianti, Sarah and Akbar, Salsabil and Lee, Jhonson and Shadieq, Nuur and Cenggoro, Tjeng Wawan and Linuwih, Hanung and Wilie, Bryan and Muridan, Galih and Winata, Genta and Moeljadi, David and Aji, Alham Fikri and Purwarianti, Ayu and Fung, Pascale", - editor = "Park, Jong C. 
and Arase, Yuki and Hu, Baotian and Lu, Wei and Wijaya, Derry and Purwarianti, Ayu and Krisnadhi, Adila Alfa", - booktitle = "Proceedings of the 13th International Joint Conference on Natural Language Processing and the 3rd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)", - month = nov, - year = "2023", - address = "Nusa Dua, Bali", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/2023.ijcnlp-main.60", - pages = "921--945", - } - """, + bibtex_citation=r""" +@inproceedings{cahyawijaya-etal-2023-nusawrites, + address = {Nusa Dua, Bali}, + author = {Cahyawijaya, Samuel and Lovenia, Holy and Koto, Fajri and Adhista, Dea and Dave, Emmanuel and Oktavianti, Sarah and Akbar, Salsabil and Lee, Jhonson and Shadieq, Nuur and Cenggoro, Tjeng Wawan and Linuwih, Hanung and Wilie, Bryan and Muridan, Galih and Winata, Genta and Moeljadi, David and Aji, Alham Fikri and Purwarianti, Ayu and Fung, Pascale}, + booktitle = {Proceedings of the 13th International Joint Conference on Natural Language Processing and the 3rd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)}, + editor = {Park, Jong C. 
and Arase, Yuki and Hu, Baotian and Lu, Wei and Wijaya, Derry and Purwarianti, Ayu and Krisnadhi, Adila Alfa}, + month = nov, + pages = {921--945}, + publisher = {Association for Computational Linguistics}, + title = {NusaWrites: Constructing High-Quality Corpora for Underrepresented and Extremely Low-Resource Languages}, + url = {https://aclanthology.org/2023.ijcnlp-main.60}, + year = {2023}, +} +""", ) diff --git a/mteb/tasks/Classification/multilingual/NusaXSenti.py b/mteb/tasks/Classification/multilingual/NusaXSenti.py index 1b9fa2460a..368abb1e6e 100644 --- a/mteb/tasks/Classification/multilingual/NusaXSenti.py +++ b/mteb/tasks/Classification/multilingual/NusaXSenti.py @@ -41,18 +41,18 @@ class NusaXSentiClassification(AbsTaskClassification, MultilingualTask): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{winata2022nusax, - title={NusaX: Multilingual Parallel Sentiment Dataset for 10 Indonesian Local Languages}, - author={Winata, Genta Indra and Aji, Alham Fikri and Cahyawijaya, - Samuel and Mahendra, Rahmad and Koto, Fajri and Romadhony, - Ade and Kurniawan, Kemal and Moeljadi, David and Prasojo, - Radityo Eko and Fung, Pascale and Baldwin, Timothy and Lau, - Jey Han and Sennrich, Rico and Ruder, Sebastian}, - year={2022}, - eprint={2205.15960}, - archivePrefix={arXiv}, - primaryClass={cs.CL} + bibtex_citation=r""" +@misc{winata2022nusax, + archiveprefix = {arXiv}, + author = {Winata, Genta Indra and Aji, Alham Fikri and Cahyawijaya, +Samuel and Mahendra, Rahmad and Koto, Fajri and Romadhony, +Ade and Kurniawan, Kemal and Moeljadi, David and Prasojo, +Radityo Eko and Fung, Pascale and Baldwin, Timothy and Lau, +Jey Han and Sennrich, Rico and Ruder, Sebastian}, + eprint = {2205.15960}, + primaryclass = {cs.CL}, + title = {NusaX: Multilingual Parallel Sentiment Dataset for 10 Indonesian Local Languages}, + year = {2022}, } """, ) diff --git 
a/mteb/tasks/Classification/multilingual/SIB200Classification.py b/mteb/tasks/Classification/multilingual/SIB200Classification.py index 88e5d4b9c8..bbfb20e40b 100644 --- a/mteb/tasks/Classification/multilingual/SIB200Classification.py +++ b/mteb/tasks/Classification/multilingual/SIB200Classification.py @@ -232,12 +232,14 @@ class SIB200Classification(MultilingualTask, AbsTaskClassification): annotations_creators="expert-annotated", # expert annotated for English --> human translations dialect=[], sample_creation="human-translated and localized", - bibtex_citation="""@article{adelani2023sib, - title={SIB-200: A simple, inclusive, and big evaluation dataset for topic classification in 200+ languages and dialects}, - author={Adelani, David Ifeoluwa and Liu, Hannah and Shen, Xiaoyu and Vassilyev, Nikita and Alabi, Jesujoba O and Mao, Yanke and Gao, Haonan and Lee, Annie En-Shiun}, - journal={arXiv preprint arXiv:2309.07445}, - year={2023} - }""", + bibtex_citation=r""" +@article{adelani2023sib, + author = {Adelani, David Ifeoluwa and Liu, Hannah and Shen, Xiaoyu and Vassilyev, Nikita and Alabi, Jesujoba O and Mao, Yanke and Gao, Haonan and Lee, Annie En-Shiun}, + journal = {arXiv preprint arXiv:2309.07445}, + title = {SIB-200: A simple, inclusive, and big evaluation dataset for topic classification in 200+ languages and dialects}, + year = {2023}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Classification/multilingual/ScalaClassification.py b/mteb/tasks/Classification/multilingual/ScalaClassification.py index fec0b48b18..d037ac9c9d 100644 --- a/mteb/tasks/Classification/multilingual/ScalaClassification.py +++ b/mteb/tasks/Classification/multilingual/ScalaClassification.py @@ -38,19 +38,21 @@ class ScalaClassification(AbsTaskClassification, MultilingualTask): annotations_creators="human-annotated", dialect=[], sample_creation="created", - bibtex_citation="""@inproceedings{nielsen-2023-scandeval, - title = "{S}cand{E}val: A Benchmark for 
{S}candinavian Natural Language Processing", - author = "Nielsen, Dan", - editor = {Alum{\"a}e, Tanel and - Fishel, Mark}, - booktitle = "Proceedings of the 24th Nordic Conference on Computational Linguistics (NoDaLiDa)", - month = may, - year = "2023", - address = "T{\'o}rshavn, Faroe Islands", - publisher = "University of Tartu Library", - url = "https://aclanthology.org/2023.nodalida-1.20", - pages = "185--201", - }""", + bibtex_citation=r""" +@inproceedings{nielsen-2023-scandeval, + address = {T{\'o}rshavn, Faroe Islands}, + author = {Nielsen, Dan}, + booktitle = {Proceedings of the 24th Nordic Conference on Computational Linguistics (NoDaLiDa)}, + editor = {Alum{\"a}e, Tanel and +Fishel, Mark}, + month = may, + pages = {185--201}, + publisher = {University of Tartu Library}, + title = {{S}cand{E}val: A Benchmark for {S}candinavian Natural Language Processing}, + url = {https://aclanthology.org/2023.nodalida-1.20}, + year = {2023}, +} +""", prompt="Classify passages in Scandinavian Languages based on linguistic acceptability", ) diff --git a/mteb/tasks/Classification/multilingual/ScandiSentClassification.py b/mteb/tasks/Classification/multilingual/ScandiSentClassification.py new file mode 100644 index 0000000000..09bcd28a5a --- /dev/null +++ b/mteb/tasks/Classification/multilingual/ScandiSentClassification.py @@ -0,0 +1,55 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.MultilingualTask import MultilingualTask +from mteb.abstasks.TaskMetadata import TaskMetadata + +_LANGUAGES = { + "da": ["dan-Latn"], + "en": ["eng-Latn"], + "fi": ["fin-Latn"], + "no": ["nob-Latn"], + "sv": ["swe-Latn"], +} + + +class ScandiSentClassification(MultilingualTask, AbsTaskClassification): + metadata = TaskMetadata( + name="ScandiSentClassification", + dataset={ + "path": "mteb/scandisent", + "revision": "97672414ad7453a106edfbfb1a0ceb152355b9dd", + }, + description="The corpus is crawled from 
se.trustpilot.com, no.trustpilot.com, dk.trustpilot.com, fi.trustpilot.com and trustpilot.com.", + reference="https://github.com/timpal0l/ScandiSent", + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=_LANGUAGES, + main_score="accuracy", + date=("2020-09-01", "2022-04-09"), + domains=["Reviews", "Written"], + dialect=[], + task_subtypes=["Sentiment/Hate speech"], + license="openrail", + annotations_creators="expert-annotated", + sample_creation="found", + bibtex_citation=r""" +@inproceedings{isbister-etal-2021-stop, + address = {Reykjavik, Iceland (Online)}, + author = {Isbister, Tim and +Carlsson, Fredrik and +Sahlgren, Magnus}, + booktitle = {Proceedings of the 23rd Nordic Conference on Computational Linguistics (NoDaLiDa)}, + editor = {Dobnik, Simon and +{\O}vrelid, Lilja}, + month = may # { 31--2 } # jun, + pages = {385--390}, + publisher = {Link{\"o}ping University Electronic Press, Sweden}, + title = {Should we Stop Training More Monolingual Models, and Simply Use Machine Translation Instead?}, + url = {https://aclanthology.org/2021.nodalida-main.42/}, + year = {2021}, +} +""", + ) diff --git a/mteb/tasks/Classification/multilingual/SouthAfricanLangClassification.py b/mteb/tasks/Classification/multilingual/SouthAfricanLangClassification.py index 217d300ec0..14c3fa3f85 100644 --- a/mteb/tasks/Classification/multilingual/SouthAfricanLangClassification.py +++ b/mteb/tasks/Classification/multilingual/SouthAfricanLangClassification.py @@ -40,13 +40,15 @@ class SouthAfricanLangClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@misc{south-african-language-identification, - author = {ExploreAI Academy, Joanne M}, - title = {South African Language Identification}, - publisher = {Kaggle}, - year = {2022}, - url = {https://kaggle.com/competitions/south-african-language-identification} -}""", + bibtex_citation=r""" 
+@misc{south-african-language-identification, + author = {ExploreAI Academy, Joanne M}, + publisher = {Kaggle}, + title = {South African Language Identification}, + url = {https://kaggle.com/competitions/south-african-language-identification}, + year = {2022}, +} +""", ) def dataset_transform(self) -> None: diff --git a/mteb/tasks/Classification/multilingual/SwissJudgementClassification.py b/mteb/tasks/Classification/multilingual/SwissJudgementClassification.py index 92aa43268c..09650dac8b 100644 --- a/mteb/tasks/Classification/multilingual/SwissJudgementClassification.py +++ b/mteb/tasks/Classification/multilingual/SwissJudgementClassification.py @@ -34,13 +34,14 @@ class SwissJudgementClassification(MultilingualTask, AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@misc{niklaus2022empirical, - title={An Empirical Study on Cross-X Transfer for Legal Judgment Prediction}, - author={Joel Niklaus and Matthias Stürmer and Ilias Chalkidis}, - year={2022}, - eprint={2209.12325}, - archivePrefix={arXiv}, - primaryClass={cs.CL} + bibtex_citation=r""" +@misc{niklaus2022empirical, + archiveprefix = {arXiv}, + author = {Joel Niklaus and Matthias Stürmer and Ilias Chalkidis}, + eprint = {2209.12325}, + primaryclass = {cs.CL}, + title = {An Empirical Study on Cross-X Transfer for Legal Judgment Prediction}, + year = {2022}, } """, ) diff --git a/mteb/tasks/Classification/multilingual/TweetSentimentClassification.py b/mteb/tasks/Classification/multilingual/TweetSentimentClassification.py index 4105f975a9..1d05a0f885 100644 --- a/mteb/tasks/Classification/multilingual/TweetSentimentClassification.py +++ b/mteb/tasks/Classification/multilingual/TweetSentimentClassification.py @@ -39,22 +39,22 @@ class TweetSentimentClassification(MultilingualTask, AbsTaskClassification): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - 
@inproceedings{barbieri-etal-2022-xlm, - title = "{XLM}-{T}: Multilingual Language Models in {T}witter for Sentiment Analysis and Beyond", - author = "Barbieri, Francesco and - Espinosa Anke, Luis and - Camacho-Collados, Jose", - booktitle = "Proceedings of the Thirteenth Language Resources and Evaluation Conference", - month = jun, - year = "2022", - address = "Marseille, France", - publisher = "European Language Resources Association", - url = "https://aclanthology.org/2022.lrec-1.27", - pages = "258--266", - abstract = "Language models are ubiquitous in current NLP, and their multilingual capacity has recently attracted considerable attention. However, current analyses have almost exclusively focused on (multilingual variants of) standard benchmarks, and have relied on clean pre-training and task-specific corpora as multilingual signals. In this paper, we introduce XLM-T, a model to train and evaluate multilingual language models in Twitter. In this paper we provide: (1) a new strong multilingual baseline consisting of an XLM-R (Conneau et al. 2020) model pre-trained on millions of tweets in over thirty languages, alongside starter code to subsequently fine-tune on a target task; and (2) a set of unified sentiment analysis Twitter datasets in eight different languages and a XLM-T model trained on this dataset.", - } - """, + bibtex_citation=r""" +@inproceedings{barbieri-etal-2022-xlm, + abstract = {Language models are ubiquitous in current NLP, and their multilingual capacity has recently attracted considerable attention. However, current analyses have almost exclusively focused on (multilingual variants of) standard benchmarks, and have relied on clean pre-training and task-specific corpora as multilingual signals. In this paper, we introduce XLM-T, a model to train and evaluate multilingual language models in Twitter. In this paper we provide: (1) a new strong multilingual baseline consisting of an XLM-R (Conneau et al. 
2020) model pre-trained on millions of tweets in over thirty languages, alongside starter code to subsequently fine-tune on a target task; and (2) a set of unified sentiment analysis Twitter datasets in eight different languages and a XLM-T model trained on this dataset.}, + address = {Marseille, France}, + author = {Barbieri, Francesco and +Espinosa Anke, Luis and +Camacho-Collados, Jose}, + booktitle = {Proceedings of the Thirteenth Language Resources and Evaluation Conference}, + month = jun, + pages = {258--266}, + publisher = {European Language Resources Association}, + title = {{XLM}-{T}: Multilingual Language Models in {T}witter for Sentiment Analysis and Beyond}, + url = {https://aclanthology.org/2022.lrec-1.27}, + year = {2022}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Classification/mya/MyanmarNews.py b/mteb/tasks/Classification/mya/MyanmarNews.py index 8418e20533..0d8daa8e1a 100644 --- a/mteb/tasks/Classification/mya/MyanmarNews.py +++ b/mteb/tasks/Classification/mya/MyanmarNews.py @@ -27,15 +27,16 @@ class MyanmarNews(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""" - @inproceedings{Khine2017, - author = {A. H. Khine and K. T. Nwet and K. M. Soe}, - title = {Automatic Myanmar News Classification}, - booktitle = {15th Proceedings of International Conference on Computer Applications}, - year = {2017}, - month = {February}, - pages = {401--408} - }""", + bibtex_citation=r""" +@inproceedings{Khine2017, + author = {A. H. Khine and K. T. Nwet and K. M. 
Soe}, + booktitle = {15th Proceedings of International Conference on Computer Applications}, + month = {February}, + pages = {401--408}, + title = {Automatic Myanmar News Classification}, + year = {2017}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Classification/nep/NepaliNewsClassification.py b/mteb/tasks/Classification/nep/NepaliNewsClassification.py index 85cc8d9661..d266e38a6b 100644 --- a/mteb/tasks/Classification/nep/NepaliNewsClassification.py +++ b/mteb/tasks/Classification/nep/NepaliNewsClassification.py @@ -26,27 +26,27 @@ class NepaliNewsClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation=""" - @inproceedings{arora-2020-inltk, - title = "i{NLTK}: Natural Language Toolkit for Indic Languages", - author = "Arora, Gaurav", - editor = "Park, Eunjeong L. and - Hagiwara, Masato and - Milajevs, Dmitrijs and - Liu, Nelson F. and - Chauhan, Geeticka and - Tan, Liling", - booktitle = "Proceedings of Second Workshop for NLP Open Source Software (NLP-OSS)", - month = nov, - year = "2020", - address = "Online", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/2020.nlposs-1.10", - doi = "10.18653/v1/2020.nlposs-1.10", - pages = "66--71", - abstract = "We present iNLTK, an open-source NLP library consisting of pre-trained language models and out-of-the-box support for Data Augmentation, Textual Similarity, Sentence Embeddings, Word Embeddings, Tokenization and Text Generation in 13 Indic Languages. By using pre-trained models from iNLTK for text classification on publicly available datasets, we significantly outperform previously reported results. On these datasets, we also show that by using pre-trained models and data augmentation from iNLTK, we can achieve more than 95{\%} of the previous best performance by using less than 10{\%} of the training data. 
iNLTK is already being widely used by the community and has 40,000+ downloads, 600+ stars and 100+ forks on GitHub.", - } - """, + bibtex_citation=r""" +@inproceedings{arora-2020-inltk, + abstract = {We present iNLTK, an open-source NLP library consisting of pre-trained language models and out-of-the-box support for Data Augmentation, Textual Similarity, Sentence Embeddings, Word Embeddings, Tokenization and Text Generation in 13 Indic Languages. By using pre-trained models from iNLTK for text classification on publicly available datasets, we significantly outperform previously reported results. On these datasets, we also show that by using pre-trained models and data augmentation from iNLTK, we can achieve more than 95{\%} of the previous best performance by using less than 10{\%} of the training data. iNLTK is already being widely used by the community and has 40,000+ downloads, 600+ stars and 100+ forks on GitHub.}, + address = {Online}, + author = {Arora, Gaurav}, + booktitle = {Proceedings of Second Workshop for NLP Open Source Software (NLP-OSS)}, + doi = {10.18653/v1/2020.nlposs-1.10}, + editor = {Park, Eunjeong L. and +Hagiwara, Masato and +Milajevs, Dmitrijs and +Liu, Nelson F. 
and +Chauhan, Geeticka and +Tan, Liling}, + month = nov, + pages = {66--71}, + publisher = {Association for Computational Linguistics}, + title = {i{NLTK}: Natural Language Toolkit for Indic Languages}, + url = {https://aclanthology.org/2020.nlposs-1.10}, + year = {2020}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Classification/nld/DutchBookReviewSentimentClassification.py b/mteb/tasks/Classification/nld/DutchBookReviewSentimentClassification.py index f0ee1b07dc..882562f5f4 100644 --- a/mteb/tasks/Classification/nld/DutchBookReviewSentimentClassification.py +++ b/mteb/tasks/Classification/nld/DutchBookReviewSentimentClassification.py @@ -27,20 +27,21 @@ class DutchBookReviewSentimentClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@article{DBLP:journals/corr/abs-1910-00896, - author = {Benjamin, van der Burgh and - Suzan, Verberne}, - title = {The merits of Universal Language Model Fine-tuning for Small Datasets - - a case with Dutch book reviews}, - journal = {CoRR}, - volume = {abs/1910.00896}, - year = {2019}, - url = {http://arxiv.org/abs/1910.00896}, - archivePrefix = {arXiv}, - eprint = {1910.00896}, + bibtex_citation=r""" +@article{DBLP:journals/corr/abs-1910-00896, + archiveprefix = {arXiv}, + author = {Benjamin, van der Burgh and +Suzan, Verberne}, + bibsource = {dblp computer science bibliography, https://dblp.org}, + biburl = {https://dblp.org/rec/journals/corr/abs-1910-00896.bib}, + eprint = {1910.00896}, + journal = {CoRR}, timestamp = {Fri, 04 Oct 2019 12:28:06 +0200}, - biburl = {https://dblp.org/rec/journals/corr/abs-1910-00896.bib}, - bibsource = {dblp computer science bibliography, https://dblp.org} + title = {The merits of Universal Language Model Fine-tuning for Small Datasets +- a case with Dutch book reviews}, + url = {http://arxiv.org/abs/1910.00896}, + volume = {abs/1910.00896}, + year = {2019}, } """, ) diff --git 
a/mteb/tasks/Classification/nob/NoRecClassification.py b/mteb/tasks/Classification/nob/NoRecClassification.py index 8391c880c0..3dfa084b0e 100644 --- a/mteb/tasks/Classification/nob/NoRecClassification.py +++ b/mteb/tasks/Classification/nob/NoRecClassification.py @@ -27,34 +27,35 @@ class NoRecClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{velldal-etal-2018-norec, - title = "{N}o{R}e{C}: The {N}orwegian Review Corpus", - author = "Velldal, Erik and - {\\O}vrelid, Lilja and - Bergem, Eivind Alexander and - Stadsnes, Cathrine and - Touileb, Samia and - J{\\o}rgensen, Fredrik", - editor = "Calzolari, Nicoletta and - Choukri, Khalid and - Cieri, Christopher and - Declerck, Thierry and - Goggi, Sara and - Hasida, Koiti and - Isahara, Hitoshi and - Maegaard, Bente and - Mariani, Joseph and - Mazo, H{\\'e}l{\\`e}ne and - Moreno, Asuncion and - Odijk, Jan and - Piperidis, Stelios and - Tokunaga, Takenobu", - booktitle = "Proceedings of the Eleventh International Conference on Language Resources and Evaluation ({LREC} 2018)", - month = may, - year = "2018", - address = "Miyazaki, Japan", - publisher = "European Language Resources Association (ELRA)", - url = "https://aclanthology.org/L18-1661", + bibtex_citation=r""" +@inproceedings{velldal-etal-2018-norec, + address = {Miyazaki, Japan}, + author = {Velldal, Erik and +{\O}vrelid, Lilja and +Bergem, Eivind Alexander and +Stadsnes, Cathrine and +Touileb, Samia and +J{\o}rgensen, Fredrik}, + booktitle = {Proceedings of the Eleventh International Conference on Language Resources and Evaluation ({LREC} 2018)}, + editor = {Calzolari, Nicoletta and +Choukri, Khalid and +Cieri, Christopher and +Declerck, Thierry and +Goggi, Sara and +Hasida, Koiti and +Isahara, Hitoshi and +Maegaard, Bente and +Mariani, Joseph and +Mazo, H{\'e}l{\`e}ne and +Moreno, Asuncion and +Odijk, Jan and +Piperidis, Stelios and +Tokunaga, Takenobu}, + month = 
may, + publisher = {European Language Resources Association (ELRA)}, + title = {{N}o{R}e{C}: The {N}orwegian Review Corpus}, + url = {https://aclanthology.org/L18-1661}, + year = {2018}, } """, prompt="Classify Norwegian reviews by sentiment", diff --git a/mteb/tasks/Classification/nob/NorwegianParliamentClassification.py b/mteb/tasks/Classification/nob/NorwegianParliamentClassification.py index e46ae6612a..b91c704063 100644 --- a/mteb/tasks/Classification/nob/NorwegianParliamentClassification.py +++ b/mteb/tasks/Classification/nob/NorwegianParliamentClassification.py @@ -28,22 +28,24 @@ class NorwegianParliamentClassification(AbsTaskClassification): annotations_creators="derived", # based on the speaker affiliation dialect=[], # unknown sample_creation="found", - bibtex_citation="""@inproceedings{kummervold-etal-2021-operationalizing, - title = "Operationalizing a National Digital Library: The Case for a {N}orwegian Transformer Model", - author = "Kummervold, Per E and - De la Rosa, Javier and - Wetjen, Freddy and - Brygfjeld, Svein Arne", - editor = "Dobnik, Simon and - {\O}vrelid, Lilja", - booktitle = "Proceedings of the 23rd Nordic Conference on Computational Linguistics (NoDaLiDa)", - month = may # " 31--2 " # jun, - year = "2021", - address = "Reykjavik, Iceland (Online)", - publisher = {Link{\"o}ping University Electronic Press, Sweden}, - url = "https://aclanthology.org/2021.nodalida-main.3", - pages = "20--29", - abstract = "In this work, we show the process of building a large-scale training set from digital and digitized collections at a national library. The resulting Bidirectional Encoder Representations from Transformers (BERT)-based language model for Norwegian outperforms multilingual BERT (mBERT) models in several token and sequence classification tasks for both Norwegian Bokm{\aa}l and Norwegian Nynorsk. Our model also improves the mBERT performance for other languages present in the corpus such as English, Swedish, and Danish. 
For languages not included in the corpus, the weights degrade moderately while keeping strong multilingual properties. Therefore, we show that building high-quality models within a memory institution using somewhat noisy optical character recognition (OCR) content is feasible, and we hope to pave the way for other memory institutions to follow.", -}""", + bibtex_citation=r""" +@inproceedings{kummervold-etal-2021-operationalizing, + abstract = {In this work, we show the process of building a large-scale training set from digital and digitized collections at a national library. The resulting Bidirectional Encoder Representations from Transformers (BERT)-based language model for Norwegian outperforms multilingual BERT (mBERT) models in several token and sequence classification tasks for both Norwegian Bokm{\aa}l and Norwegian Nynorsk. Our model also improves the mBERT performance for other languages present in the corpus such as English, Swedish, and Danish. For languages not included in the corpus, the weights degrade moderately while keeping strong multilingual properties. 
Therefore, we show that building high-quality models within a memory institution using somewhat noisy optical character recognition (OCR) content is feasible, and we hope to pave the way for other memory institutions to follow.}, + address = {Reykjavik, Iceland (Online)}, + author = {Kummervold, Per E and +De la Rosa, Javier and +Wetjen, Freddy and +Brygfjeld, Svein Arne}, + booktitle = {Proceedings of the 23rd Nordic Conference on Computational Linguistics (NoDaLiDa)}, + editor = {Dobnik, Simon and +{\O}vrelid, Lilja}, + month = may # { 31--2 } # jun, + pages = {20--29}, + publisher = {Link{\"o}ping University Electronic Press, Sweden}, + title = {Operationalizing a National Digital Library: The Case for a {N}orwegian Transformer Model}, + url = {https://aclanthology.org/2021.nodalida-main.3}, + year = {2021}, +} +""", prompt="Classify parliament speeches in Norwegian based on political affiliation", ) diff --git a/mteb/tasks/Classification/ory/OdiaNewsClassification.py b/mteb/tasks/Classification/ory/OdiaNewsClassification.py index 6e89c50ab1..214b8f67f3 100644 --- a/mteb/tasks/Classification/ory/OdiaNewsClassification.py +++ b/mteb/tasks/Classification/ory/OdiaNewsClassification.py @@ -26,12 +26,14 @@ class OdiaNewsClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@article{kunchukuttan2020indicnlpcorpus, - title={AI4Bharat-IndicNLP Corpus: Monolingual Corpora and Word Embeddings for Indic Languages}, - author={Anoop Kunchukuttan and Divyanshu Kakwani and Satish Golla and Gokul N.C. and Avik Bhattacharyya and Mitesh M. Khapra and Pratyush Kumar}, - year={2020}, - journal={arXiv preprint arXiv:2005.00085}, -}""", + bibtex_citation=r""" +@article{kunchukuttan2020indicnlpcorpus, + author = {Anoop Kunchukuttan and Divyanshu Kakwani and Satish Golla and Gokul N.C. and Avik Bhattacharyya and Mitesh M. 
Khapra and Pratyush Kumar}, + journal = {arXiv preprint arXiv:2005.00085}, + title = {AI4Bharat-IndicNLP Corpus: Monolingual Corpora and Word Embeddings for Indic Languages}, + year = {2020}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Classification/pan/PunjabiNewsClassification.py b/mteb/tasks/Classification/pan/PunjabiNewsClassification.py index fb948d7746..bc4b812dd1 100644 --- a/mteb/tasks/Classification/pan/PunjabiNewsClassification.py +++ b/mteb/tasks/Classification/pan/PunjabiNewsClassification.py @@ -26,12 +26,14 @@ class PunjabiNewsClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@article{kunchukuttan2020indicnlpcorpus, - title={AI4Bharat-IndicNLP Corpus: Monolingual Corpora and Word Embeddings for Indic Languages}, - author={Anoop Kunchukuttan and Divyanshu Kakwani and Satish Golla and Gokul N.C. and Avik Bhattacharyya and Mitesh M. Khapra and Pratyush Kumar}, - year={2020}, - journal={arXiv preprint arXiv:2005.00085}, -}""", + bibtex_citation=r""" +@article{kunchukuttan2020indicnlpcorpus, + author = {Anoop Kunchukuttan and Divyanshu Kakwani and Satish Golla and Gokul N.C. and Avik Bhattacharyya and Mitesh M. 
Khapra and Pratyush Kumar}, + journal = {arXiv preprint arXiv:2005.00085}, + title = {AI4Bharat-IndicNLP Corpus: Monolingual Corpora and Word Embeddings for Indic Languages}, + year = {2020}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Classification/pol/PolishClassification.py b/mteb/tasks/Classification/pol/PolishClassification.py index c0963e8283..805ac4b424 100644 --- a/mteb/tasks/Classification/pol/PolishClassification.py +++ b/mteb/tasks/Classification/pol/PolishClassification.py @@ -26,15 +26,17 @@ class CbdClassification(AbsTaskClassification): annotations_creators="human-annotated", # guess dialect=[], sample_creation="found", - bibtex_citation="""@proceedings{ogr:kob:19:poleval, - editor = {Maciej Ogrodniczuk and Łukasz Kobyliński}, - title = {{Proceedings of the PolEval 2019 Workshop}}, - year = {2019}, - address = {Warsaw, Poland}, + bibtex_citation=r""" +@proceedings{ogr:kob:19:poleval, + address = {Warsaw, Poland}, + editor = {Maciej Ogrodniczuk and Łukasz Kobyliński}, + isbn = {978-83-63159-28-3}, publisher = {Institute of Computer Science, Polish Academy of Sciences}, - url = {http://2019.poleval.pl/files/poleval2019.pdf}, - isbn = "978-83-63159-28-3"} -}""", + title = {{Proceedings of the PolEval 2019 Workshop}}, + url = {http://2019.poleval.pl/files/poleval2019.pdf}, + year = {2019}, +} +""", ) @@ -61,21 +63,23 @@ class PolEmo2InClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{kocon-etal-2019-multi, - title = "Multi-Level Sentiment Analysis of {P}ol{E}mo 2.0: Extended Corpus of Multi-Domain Consumer Reviews", - author = "Koco{\'n}, Jan and - Mi{\l}kowski, Piotr and - Za{\'s}ko-Zieli{\'n}ska, Monika", - booktitle = "Proceedings of the 23rd Conference on Computational Natural Language Learning (CoNLL)", - month = nov, - year = "2019", - address = "Hong Kong, China", - publisher = "Association for Computational Linguistics", - url = 
"https://aclanthology.org/K19-1092", - doi = "10.18653/v1/K19-1092", - pages = "980--991", - abstract = "In this article we present an extended version of PolEmo {--} a corpus of consumer reviews from 4 domains: medicine, hotels, products and school. Current version (PolEmo 2.0) contains 8,216 reviews having 57,466 sentences. Each text and sentence was manually annotated with sentiment in 2+1 scheme, which gives a total of 197,046 annotations. We obtained a high value of Positive Specific Agreement, which is 0.91 for texts and 0.88 for sentences. PolEmo 2.0 is publicly available under a Creative Commons copyright license. We explored recent deep learning approaches for the recognition of sentiment, such as Bi-directional Long Short-Term Memory (BiLSTM) and Bidirectional Encoder Representations from Transformers (BERT).", -}""", + bibtex_citation=r""" +@inproceedings{kocon-etal-2019-multi, + abstract = {In this article we present an extended version of PolEmo {--} a corpus of consumer reviews from 4 domains: medicine, hotels, products and school. Current version (PolEmo 2.0) contains 8,216 reviews having 57,466 sentences. Each text and sentence was manually annotated with sentiment in 2+1 scheme, which gives a total of 197,046 annotations. We obtained a high value of Positive Specific Agreement, which is 0.91 for texts and 0.88 for sentences. PolEmo 2.0 is publicly available under a Creative Commons copyright license. 
We explored recent deep learning approaches for the recognition of sentiment, such as Bi-directional Long Short-Term Memory (BiLSTM) and Bidirectional Encoder Representations from Transformers (BERT).}, + address = {Hong Kong, China}, + author = {Koco{\'n}, Jan and +Mi{\l}kowski, Piotr and +Za{\'s}ko-Zieli{\'n}ska, Monika}, + booktitle = {Proceedings of the 23rd Conference on Computational Natural Language Learning (CoNLL)}, + doi = {10.18653/v1/K19-1092}, + month = nov, + pages = {980--991}, + publisher = {Association for Computational Linguistics}, + title = {Multi-Level Sentiment Analysis of {P}ol{E}mo 2.0: Extended Corpus of Multi-Domain Consumer Reviews}, + url = {https://aclanthology.org/K19-1092}, + year = {2019}, +} +""", ) @@ -123,7 +127,7 @@ class AllegroReviewsClassification(AbsTaskClassification): eval_langs=["pol-Latn"], main_score="accuracy", date=None, - domains=None, + domains=["Reviews"], task_subtypes=None, license=None, annotations_creators=None, @@ -156,13 +160,15 @@ class PacClassification(AbsTaskClassification): annotations_creators=None, dialect=[], sample_creation=None, - bibtex_citation="""@misc{augustyniak2022waydesigningcompilinglepiszcze, - title={This is the way: designing and compiling LEPISZCZE, a comprehensive NLP benchmark for Polish}, - author={Łukasz Augustyniak and Kamil Tagowski and Albert Sawczyn and Denis Janiak and Roman Bartusiak and Adrian Szymczak and Marcin Wątroba and Arkadiusz Janz and Piotr Szymański and Mikołaj Morzy and Tomasz Kajdanowicz and Maciej Piasecki}, - year={2022}, - eprint={2211.13112}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2211.13112}, -}""", + bibtex_citation=r""" +@misc{augustyniak2022waydesigningcompilinglepiszcze, + archiveprefix = {arXiv}, + author = {Łukasz Augustyniak and Kamil Tagowski and Albert Sawczyn and Denis Janiak and Roman Bartusiak and Adrian Szymczak and Marcin Wątroba and Arkadiusz Janz and Piotr Szymański and Mikołaj Morzy and Tomasz Kajdanowicz 
and Maciej Piasecki}, + eprint = {2211.13112}, + primaryclass = {cs.CL}, + title = {This is the way: designing and compiling LEPISZCZE, a comprehensive NLP benchmark for Polish}, + url = {https://arxiv.org/abs/2211.13112}, + year = {2022}, +} +""", ) diff --git a/mteb/tasks/Classification/por/HateSpeechPortugueseClassification.py b/mteb/tasks/Classification/por/HateSpeechPortugueseClassification.py index a7abf6b0f9..8920f8043e 100644 --- a/mteb/tasks/Classification/por/HateSpeechPortugueseClassification.py +++ b/mteb/tasks/Classification/por/HateSpeechPortugueseClassification.py @@ -27,28 +27,28 @@ class HateSpeechPortugueseClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @inproceedings{fortuna-etal-2019-hierarchically, - title = "A Hierarchically-Labeled {P}ortuguese Hate Speech Dataset", - author = "Fortuna, Paula and - Rocha da Silva, Jo{\~a}o and - Soler-Company, Juan and - Wanner, Leo and - Nunes, S{\'e}rgio", - editor = "Roberts, Sarah T. and - Tetreault, Joel and - Prabhakaran, Vinodkumar and - Waseem, Zeerak", - booktitle = "Proceedings of the Third Workshop on Abusive Language Online", - month = aug, - year = "2019", - address = "Florence, Italy", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/W19-3510", - doi = "10.18653/v1/W19-3510", - pages = "94--104", - } - """, + bibtex_citation=r""" +@inproceedings{fortuna-etal-2019-hierarchically, + address = {Florence, Italy}, + author = {Fortuna, Paula and +Rocha da Silva, Jo{\~a}o and +Soler-Company, Juan and +Wanner, Leo and +Nunes, S{\'e}rgio}, + booktitle = {Proceedings of the Third Workshop on Abusive Language Online}, + doi = {10.18653/v1/W19-3510}, + editor = {Roberts, Sarah T. 
and +Tetreault, Joel and +Prabhakaran, Vinodkumar and +Waseem, Zeerak}, + month = aug, + pages = {94--104}, + publisher = {Association for Computational Linguistics}, + title = {A Hierarchically-Labeled {P}ortuguese Hate Speech Dataset}, + url = {https://aclanthology.org/W19-3510}, + year = {2019}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Classification/ron/Moroco.py b/mteb/tasks/Classification/ron/Moroco.py index 2324f16ef8..479dd5a1d4 100644 --- a/mteb/tasks/Classification/ron/Moroco.py +++ b/mteb/tasks/Classification/ron/Moroco.py @@ -32,15 +32,15 @@ class Moroco(AbsTaskClassification): "ron-Latn-mol", ], # Moldavian, or the Romanian dialect used in Moldova, does not have an ISO 639-1 code assigned to it. However, it has been given the three-letter code "mol" under ISO 639-3 sample_creation="found", - bibtex_citation="""" - @inproceedings{ Butnaru-ACL-2019, - author = {Andrei M. Butnaru and Radu Tudor Ionescu}, - title = "{MOROCO: The Moldavian and Romanian Dialectal Corpus}", - booktitle = {Proceedings of ACL}, - year = {2019}, - pages={688--698}, - } - """, + bibtex_citation=r""" +@inproceedings{Butnaru-ACL-2019, + author = {Andrei M. Butnaru and Radu Tudor Ionescu}, + booktitle = {Proceedings of ACL}, + pages = {688--698}, + title = {{MOROCO: The Moldavian and Romanian Dialectal Corpus}}, + year = {2019}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Classification/ron/RomanianReviewsSentiment.py b/mteb/tasks/Classification/ron/RomanianReviewsSentiment.py index 6666d615a3..be06de0fa5 100644 --- a/mteb/tasks/Classification/ron/RomanianReviewsSentiment.py +++ b/mteb/tasks/Classification/ron/RomanianReviewsSentiment.py @@ -27,13 +27,12 @@ class RomanianReviewsSentiment(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation=""" -@article{ - tache2101clustering, - title={Clustering Word Embeddings with Self-Organizing Maps. 
Application on LaRoSeDa -- A Large Romanian Sentiment Data Set}, - author={Anca Maria Tache and Mihaela Gaman and Radu Tudor Ionescu}, - journal={ArXiv}, - year = {2021} + bibtex_citation=r""" +@article{tache2101clustering, + author = {Anca Maria Tache and Mihaela Gaman and Radu Tudor Ionescu}, + journal = {ArXiv}, + title = {Clustering Word Embeddings with Self-Organizing Maps. Application on LaRoSeDa -- A Large Romanian Sentiment Data Set}, + year = {2021}, } """, ) diff --git a/mteb/tasks/Classification/ron/RomanianSentimentClassification.py b/mteb/tasks/Classification/ron/RomanianSentimentClassification.py index 1bcfd0052c..3622620d50 100644 --- a/mteb/tasks/Classification/ron/RomanianSentimentClassification.py +++ b/mteb/tasks/Classification/ron/RomanianSentimentClassification.py @@ -29,11 +29,12 @@ class RomanianSentimentClassification(AbsTaskClassification): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@article{dumitrescu2020birth, - title={The birth of Romanian BERT}, - author={Dumitrescu, Stefan Daniel and Avram, Andrei-Marius and Pyysalo, Sampo}, - journal={arXiv preprint arXiv:2009.08712}, - year={2020} + bibtex_citation=r""" +@article{dumitrescu2020birth, + author = {Dumitrescu, Stefan Daniel and Avram, Andrei-Marius and Pyysalo, Sampo}, + journal = {arXiv preprint arXiv:2009.08712}, + title = {The birth of Romanian BERT}, + year = {2020}, } """, ) diff --git a/mteb/tasks/Classification/rus/HeadlineClassification.py b/mteb/tasks/Classification/rus/HeadlineClassification.py index ca16fd6a85..9def591d0c 100644 --- a/mteb/tasks/Classification/rus/HeadlineClassification.py +++ b/mteb/tasks/Classification/rus/HeadlineClassification.py @@ -26,30 +26,32 @@ class HeadlineClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{gudkov-etal-2020-automatically, - title = "Automatically Ranked {R}ussian Paraphrase Corpus for 
Text Generation", - author = "Gudkov, Vadim and - Mitrofanova, Olga and - Filippskikh, Elizaveta", - editor = "Birch, Alexandra and - Finch, Andrew and - Hayashi, Hiroaki and - Heafield, Kenneth and - Junczys-Dowmunt, Marcin and - Konstas, Ioannis and - Li, Xian and - Neubig, Graham and - Oda, Yusuke", - booktitle = "Proceedings of the Fourth Workshop on Neural Generation and Translation", - month = jul, - year = "2020", - address = "Online", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/2020.ngt-1.6", - doi = "10.18653/v1/2020.ngt-1.6", - pages = "54--59", - abstract = "The article is focused on automatic development and ranking of a large corpus for Russian paraphrase generation which proves to be the first corpus of such type in Russian computational linguistics. Existing manually annotated paraphrase datasets for Russian are limited to small-sized ParaPhraser corpus and ParaPlag which are suitable for a set of NLP tasks, such as paraphrase and plagiarism detection, sentence similarity and relatedness estimation, etc. Due to size restrictions, these datasets can hardly be applied in end-to-end text generation solutions. Meanwhile, paraphrase generation requires a large amount of training data. In our study we propose a solution to the problem: we collect, rank and evaluate a new publicly available headline paraphrase corpus (ParaPhraser Plus), and then perform text generation experiments with manual evaluation on automatically ranked corpora using the Universal Transformer architecture.", - }""", + bibtex_citation=r""" +@inproceedings{gudkov-etal-2020-automatically, + abstract = {The article is focused on automatic development and ranking of a large corpus for Russian paraphrase generation which proves to be the first corpus of such type in Russian computational linguistics. 
Existing manually annotated paraphrase datasets for Russian are limited to small-sized ParaPhraser corpus and ParaPlag which are suitable for a set of NLP tasks, such as paraphrase and plagiarism detection, sentence similarity and relatedness estimation, etc. Due to size restrictions, these datasets can hardly be applied in end-to-end text generation solutions. Meanwhile, paraphrase generation requires a large amount of training data. In our study we propose a solution to the problem: we collect, rank and evaluate a new publicly available headline paraphrase corpus (ParaPhraser Plus), and then perform text generation experiments with manual evaluation on automatically ranked corpora using the Universal Transformer architecture.}, + address = {Online}, + author = {Gudkov, Vadim and +Mitrofanova, Olga and +Filippskikh, Elizaveta}, + booktitle = {Proceedings of the Fourth Workshop on Neural Generation and Translation}, + doi = {10.18653/v1/2020.ngt-1.6}, + editor = {Birch, Alexandra and +Finch, Andrew and +Hayashi, Hiroaki and +Heafield, Kenneth and +Junczys-Dowmunt, Marcin and +Konstas, Ioannis and +Li, Xian and +Neubig, Graham and +Oda, Yusuke}, + month = jul, + pages = {54--59}, + publisher = {Association for Computational Linguistics}, + title = {Automatically Ranked {R}ussian Paraphrase Corpus for Text Generation}, + url = {https://aclanthology.org/2020.ngt-1.6}, + year = {2020}, +} +""", prompt="Classify the topic or theme of the given news headline", ) diff --git a/mteb/tasks/Classification/rus/InappropriatenessClassification.py b/mteb/tasks/Classification/rus/InappropriatenessClassification.py index 306266d3fa..7ff0ed11b2 100644 --- a/mteb/tasks/Classification/rus/InappropriatenessClassification.py +++ b/mteb/tasks/Classification/rus/InappropriatenessClassification.py @@ -26,34 +26,36 @@ class InappropriatenessClassification(AbsTaskClassification): annotations_creators="human-annotated", dialect=[], sample_creation="found", - 
bibtex_citation="""@inproceedings{babakov-etal-2021-detecting, - title = "Detecting Inappropriate Messages on Sensitive Topics that Could Harm a Company{'}s Reputation", - author = "Babakov, Nikolay and - Logacheva, Varvara and - Kozlova, Olga and - Semenov, Nikita and - Panchenko, Alexander", - editor = "Babych, Bogdan and - Kanishcheva, Olga and - Nakov, Preslav and - Piskorski, Jakub and - Pivovarova, Lidia and - Starko, Vasyl and - Steinberger, Josef and - Yangarber, Roman and - Marci{\'n}czuk, Micha{\l} and - Pollak, Senja and - P{\v{r}}ib{\'a}{\v{n}}, Pavel and - Robnik-{\v{S}}ikonja, Marko", - booktitle = "Proceedings of the 8th Workshop on Balto-Slavic Natural Language Processing", - month = apr, - year = "2021", - address = "Kiyv, Ukraine", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/2021.bsnlp-1.4", - pages = "26--36", - abstract = "Not all topics are equally {``}flammable{''} in terms of toxicity: a calm discussion of turtles or fishing less often fuels inappropriate toxic dialogues than a discussion of politics or sexual minorities. We define a set of sensitive topics that can yield inappropriate and toxic messages and describe the methodology of collecting and labelling a dataset for appropriateness. While toxicity in user-generated data is well-studied, we aim at defining a more fine-grained notion of inappropriateness. The core of inappropriateness is that it can harm the reputation of a speaker. This is different from toxicity in two respects: (i) inappropriateness is topic-related, and (ii) inappropriate message is not toxic but still unacceptable. We collect and release two datasets for Russian: a topic-labelled dataset and an appropriateness-labelled dataset. 
We also release pre-trained classification models trained on this data.", - }""", + bibtex_citation=r""" +@inproceedings{babakov-etal-2021-detecting, + abstract = {Not all topics are equally {``}flammable{''} in terms of toxicity: a calm discussion of turtles or fishing less often fuels inappropriate toxic dialogues than a discussion of politics or sexual minorities. We define a set of sensitive topics that can yield inappropriate and toxic messages and describe the methodology of collecting and labelling a dataset for appropriateness. While toxicity in user-generated data is well-studied, we aim at defining a more fine-grained notion of inappropriateness. The core of inappropriateness is that it can harm the reputation of a speaker. This is different from toxicity in two respects: (i) inappropriateness is topic-related, and (ii) inappropriate message is not toxic but still unacceptable. We collect and release two datasets for Russian: a topic-labelled dataset and an appropriateness-labelled dataset. 
We also release pre-trained classification models trained on this data.}, + address = {Kiyv, Ukraine}, + author = {Babakov, Nikolay and +Logacheva, Varvara and +Kozlova, Olga and +Semenov, Nikita and +Panchenko, Alexander}, + booktitle = {Proceedings of the 8th Workshop on Balto-Slavic Natural Language Processing}, + editor = {Babych, Bogdan and +Kanishcheva, Olga and +Nakov, Preslav and +Piskorski, Jakub and +Pivovarova, Lidia and +Starko, Vasyl and +Steinberger, Josef and +Yangarber, Roman and +Marci{\'n}czuk, Micha{\l} and +Pollak, Senja and +P{\v{r}}ib{\'a}{\v{n}}, Pavel and +Robnik-{\v{S}}ikonja, Marko}, + month = apr, + pages = {26--36}, + publisher = {Association for Computational Linguistics}, + title = {Detecting Inappropriate Messages on Sensitive Topics that Could Harm a Company{'}s Reputation}, + url = {https://aclanthology.org/2021.bsnlp-1.4}, + year = {2021}, +} +""", prompt="Classify the given message as either sensitive topic or not", ) @@ -61,3 +63,59 @@ def dataset_transform(self): self.dataset = self.stratified_subsampling( self.dataset, seed=self.seed, n_samples=2048, splits=["test"] ) + + +class InappropriatenessClassificationv2(AbsTaskClassification): + metadata = TaskMetadata( + name="InappropriatenessClassificationv2", + dataset={ + "path": "mteb/InappropriatenessClassificationv2", + "revision": "698cb161a90150ec46618f714cdd8606cf21a9eb", + }, + description="Inappropriateness identification in the form of binary classification", + reference="https://aclanthology.org/2021.bsnlp-1.4", + type="Classification", + category="t2t", + modalities=["text"], + eval_splits=["test"], + eval_langs=["rus-Cyrl"], + main_score="accuracy", + date=("2006-01-01", "2021-04-01"), + domains=["Web", "Social", "Written"], + task_subtypes=["Sentiment/Hate speech"], + license="cc-by-nc-sa-4.0", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@inproceedings{babakov-etal-2021-detecting, + abstract = {Not all 
topics are equally {``}flammable{''} in terms of toxicity: a calm discussion of turtles or fishing less often fuels inappropriate toxic dialogues than a discussion of politics or sexual minorities. We define a set of sensitive topics that can yield inappropriate and toxic messages and describe the methodology of collecting and labelling a dataset for appropriateness. While toxicity in user-generated data is well-studied, we aim at defining a more fine-grained notion of inappropriateness. The core of inappropriateness is that it can harm the reputation of a speaker. This is different from toxicity in two respects: (i) inappropriateness is topic-related, and (ii) inappropriate message is not toxic but still unacceptable. We collect and release two datasets for Russian: a topic-labelled dataset and an appropriateness-labelled dataset. We also release pre-trained classification models trained on this data.}, + address = {Kiyv, Ukraine}, + author = {Babakov, Nikolay and +Logacheva, Varvara and +Kozlova, Olga and +Semenov, Nikita and +Panchenko, Alexander}, + booktitle = {Proceedings of the 8th Workshop on Balto-Slavic Natural Language Processing}, + editor = {Babych, Bogdan and +Kanishcheva, Olga and +Nakov, Preslav and +Piskorski, Jakub and +Pivovarova, Lidia and +Starko, Vasyl and +Steinberger, Josef and +Yangarber, Roman and +Marci{\'n}czuk, Micha{\l} and +Pollak, Senja and +P{\v{r}}ib{\'a}{\v{n}}, Pavel and +Robnik-{\v{S}}ikonja, Marko}, + month = apr, + pages = {26--36}, + publisher = {Association for Computational Linguistics}, + title = {Detecting Inappropriate Messages on Sensitive Topics that Could Harm a Company{'}s Reputation}, + url = {https://aclanthology.org/2021.bsnlp-1.4}, + year = {2021}, +} +""", + prompt="Classify the given message as either sensitive topic or not", + ) diff --git a/mteb/tasks/Classification/rus/KinopoiskClassification.py b/mteb/tasks/Classification/rus/KinopoiskClassification.py index 2fa32a7fdf..cdde48716b 100644 --- 
a/mteb/tasks/Classification/rus/KinopoiskClassification.py +++ b/mteb/tasks/Classification/rus/KinopoiskClassification.py @@ -26,14 +26,16 @@ class KinopoiskClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@article{blinov2013research, - title={Research of lexical approach and machine learning methods for sentiment analysis}, - author={Blinov, PD and Klekovkina, Maria and Kotelnikov, Eugeny and Pestov, Oleg}, - journal={Computational Linguistics and Intellectual Technologies}, - volume={2}, - number={12}, - pages={48--58}, - year={2013} - }""", + bibtex_citation=r""" +@article{blinov2013research, + author = {Blinov, PD and Klekovkina, Maria and Kotelnikov, Eugeny and Pestov, Oleg}, + journal = {Computational Linguistics and Intellectual Technologies}, + number = {12}, + pages = {48--58}, + title = {Research of lexical approach and machine learning methods for sentiment analysis}, + volume = {2}, + year = {2013}, +} +""", prompt="Classify the sentiment expressed in the given movie review text", ) diff --git a/mteb/tasks/Classification/rus/RuReviewsClassification.py b/mteb/tasks/Classification/rus/RuReviewsClassification.py index 7303f3f85d..37f9e83af3 100644 --- a/mteb/tasks/Classification/rus/RuReviewsClassification.py +++ b/mteb/tasks/Classification/rus/RuReviewsClassification.py @@ -26,18 +26,20 @@ class RuReviewsClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@INPROCEEDINGS{Smetanin-SA-2019, - author={Sergey Smetanin and Michail Komarov}, - booktitle={2019 IEEE 21st Conference on Business Informatics (CBI)}, - title={Sentiment Analysis of Product Reviews in Russian using Convolutional Neural Networks}, - year={2019}, - volume={01}, - number={}, - pages={482-486}, - doi={10.1109/CBI.2019.00062}, - ISSN={2378-1963}, - month={July} - }""", + bibtex_citation=r""" +@inproceedings{Smetanin-SA-2019, + author = 
{Sergey Smetanin and Michail Komarov}, + booktitle = {2019 IEEE 21st Conference on Business Informatics (CBI)}, + doi = {10.1109/CBI.2019.00062}, + issn = {2378-1963}, + month = {July}, + number = {}, + pages = {482-486}, + title = {Sentiment Analysis of Product Reviews in Russian using Convolutional Neural Networks}, + volume = {01}, + year = {2019}, +} +""", prompt="Classify product reviews into positive, negative or neutral sentiment", ) diff --git a/mteb/tasks/Classification/rus/ru_nlu_intent_classification.py b/mteb/tasks/Classification/rus/ru_nlu_intent_classification.py new file mode 100644 index 0000000000..d77990e4a9 --- /dev/null +++ b/mteb/tasks/Classification/rus/ru_nlu_intent_classification.py @@ -0,0 +1,52 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.MultilingualTask import MultilingualTask +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class RuNLUIntentClassification(AbsTaskClassification, MultilingualTask): + metadata = TaskMetadata( + name="RuNLUIntentClassification", + dataset={ + "path": "mteb/RuNLUIntentClassification", + "revision": "424d0f767aaa5c411e3a529eec04658e5726a39e", + }, + description=( + "Contains natural language data for human-robot interaction in home domain which we collected and" + " annotated for evaluating NLU Services/platforms." 
+ ), + reference="https://arxiv.org/abs/1903.05566", + type="Classification", + category="t2t", + modalities=["text"], + eval_splits=["test"], + eval_langs={ + "rus-eng": [ + "rus-Cyrl", + "rus-Latn", + ], + "rus": [ + "rus-Cyrl", + ], + }, + main_score="accuracy", + date=("2019-03-26", "2019-03-26"), + domains=[], + task_subtypes=["Intent Classification"], + license="cc-by-4.0", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@misc{liu2019benchmarkingnaturallanguageunderstanding, + archiveprefix = {arXiv}, + author = {Xingkun Liu and Arash Eshghi and Pawel Swietojanski and Verena Rieser}, + eprint = {1903.05566}, + primaryclass = {cs.CL}, + title = {Benchmarking Natural Language Understanding Services for building Conversational Agents}, + url = {https://arxiv.org/abs/1903.05566}, + year = {2019}, +} +""", + ) diff --git a/mteb/tasks/Classification/rus/ru_toixic_classification_okmlcup.py b/mteb/tasks/Classification/rus/ru_toixic_classification_okmlcup.py new file mode 100644 index 0000000000..8e511655e5 --- /dev/null +++ b/mteb/tasks/Classification/rus/ru_toixic_classification_okmlcup.py @@ -0,0 +1,33 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class RuToxicOKMLCUPClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="RuToxicOKMLCUPClassification", + dataset={ + "path": "mteb/RuToxicOKMLCUPClassification", + "revision": "13722b7320ef4b6a471f9e8b379f3f49167d0517", + }, + description="On the Odnoklassniki social network, users post a huge number of comments of various directions and nature every day.", + reference="https://cups.online/ru/contests/okmlcup2020", + type="Classification", + category="t2t", + modalities=["text"], + eval_splits=["test"], + eval_langs=["rus-Cyrl"], + main_score="accuracy", + date=("2015-01-01", "2020-01-01"), + domains=[], + 
task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation="""""", + ) + + def dataset_transform(self): + self.dataset = self.dataset.rename_column("toxic", "label") diff --git a/mteb/tasks/Classification/rus/senti_ru_eval.py b/mteb/tasks/Classification/rus/senti_ru_eval.py new file mode 100644 index 0000000000..a935dd8c76 --- /dev/null +++ b/mteb/tasks/Classification/rus/senti_ru_eval.py @@ -0,0 +1,40 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class SentiRuEval2016Classification(AbsTaskClassification): + metadata = TaskMetadata( + name="SentiRuEval2016", + dataset={ + "path": "mteb/SentiRuEval2016", + "revision": "8507eab0deef37f040a750afbcb4dba7a7de9c16", + }, + description="Russian sentiment analysis evaluation SentiRuEval-2016 devoted to reputation monitoring of banks " + "and telecom companies in Twitter. 
We describe the task, data, the procedure of data preparation, " + "and participants’ results.", + reference="https://github.com/mokoron/sentirueval", + type="Classification", + category="t2t", + modalities=["text"], + eval_splits=["test"], + eval_langs=["rus-Cyrl"], + main_score="accuracy", + date=("2015-01-01", "2016-01-01"), + domains=[], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@inproceedings{loukachevitch2016sentirueval, + author = {Loukachevitch, NV and Rubtsova, Yu V}, + booktitle = {Computational Linguistics and Intellectual Technologies}, + pages = {416--426}, + title = {SentiRuEval-2016: overcoming time gap and data sparsity in tweet sentiment analysis}, + year = {2016}, +} +""", + ) diff --git a/mteb/tasks/Classification/san/SanskritShlokasClassification.py b/mteb/tasks/Classification/san/SanskritShlokasClassification.py index 806e468f00..91b8436e8d 100644 --- a/mteb/tasks/Classification/san/SanskritShlokasClassification.py +++ b/mteb/tasks/Classification/san/SanskritShlokasClassification.py @@ -26,27 +26,27 @@ class SanskritShlokasClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation=""" - @inproceedings{arora-2020-inltk, - title = "i{NLTK}: Natural Language Toolkit for Indic Languages", - author = "Arora, Gaurav", - editor = "Park, Eunjeong L. and - Hagiwara, Masato and - Milajevs, Dmitrijs and - Liu, Nelson F. 
and - Chauhan, Geeticka and - Tan, Liling", - booktitle = "Proceedings of Second Workshop for NLP Open Source Software (NLP-OSS)", - month = nov, - year = "2020", - address = "Online", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/2020.nlposs-1.10", - doi = "10.18653/v1/2020.nlposs-1.10", - pages = "66--71", - abstract = "We present iNLTK, an open-source NLP library consisting of pre-trained language models and out-of-the-box support for Data Augmentation, Textual Similarity, Sentence Embeddings, Word Embeddings, Tokenization and Text Generation in 13 Indic Languages. By using pre-trained models from iNLTK for text classification on publicly available datasets, we significantly outperform previously reported results. On these datasets, we also show that by using pre-trained models and data augmentation from iNLTK, we can achieve more than 95{\%} of the previous best performance by using less than 10{\%} of the training data. iNLTK is already being widely used by the community and has 40,000+ downloads, 600+ stars and 100+ forks on GitHub.", - } - """, + bibtex_citation=r""" +@inproceedings{arora-2020-inltk, + abstract = {We present iNLTK, an open-source NLP library consisting of pre-trained language models and out-of-the-box support for Data Augmentation, Textual Similarity, Sentence Embeddings, Word Embeddings, Tokenization and Text Generation in 13 Indic Languages. By using pre-trained models from iNLTK for text classification on publicly available datasets, we significantly outperform previously reported results. On these datasets, we also show that by using pre-trained models and data augmentation from iNLTK, we can achieve more than 95{\%} of the previous best performance by using less than 10{\%} of the training data. 
iNLTK is already being widely used by the community and has 40,000+ downloads, 600+ stars and 100+ forks on GitHub.}, + address = {Online}, + author = {Arora, Gaurav}, + booktitle = {Proceedings of Second Workshop for NLP Open Source Software (NLP-OSS)}, + doi = {10.18653/v1/2020.nlposs-1.10}, + editor = {Park, Eunjeong L. and +Hagiwara, Masato and +Milajevs, Dmitrijs and +Liu, Nelson F. and +Chauhan, Geeticka and +Tan, Liling}, + month = nov, + pages = {66--71}, + publisher = {Association for Computational Linguistics}, + title = {i{NLTK}: Natural Language Toolkit for Indic Languages}, + url = {https://aclanthology.org/2020.nlposs-1.10}, + year = {2020}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Classification/sin/SinhalaNewsClassification.py b/mteb/tasks/Classification/sin/SinhalaNewsClassification.py index 98d414b3c0..4b8c54a184 100644 --- a/mteb/tasks/Classification/sin/SinhalaNewsClassification.py +++ b/mteb/tasks/Classification/sin/SinhalaNewsClassification.py @@ -26,18 +26,21 @@ class SinhalaNewsClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@article{deSilva2015, - author = {Nisansa de Silva}, - title = {Sinhala Text Classification: Observations from the Perspective of a Resource Poor Language}, - journal = {Year of Publication}, - year = {2015}, - } - @article{dhananjaya2022, - author = {Dhananjaya et al.}, - title = {BERTifying Sinhala - A Comprehensive Analysis of Pre-trained Language Models for Sinhala Text Classification}, - journal = {Year of Publication}, - year = {2022}, - }""", + bibtex_citation=r""" +@article{deSilva2015, + author = {Nisansa de Silva}, + journal = {Year of Publication}, + title = {Sinhala Text Classification: Observations from the Perspective of a Resource Poor Language}, + year = {2015}, +} + +@article{dhananjaya2022, + author = {Dhananjaya et al.}, + journal = {Year of Publication}, + title = {BERTifying Sinhala - A 
Comprehensive Analysis of Pre-trained Language Models for Sinhala Text Classification}, + year = {2022}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Classification/sin/SinhalaNewsSourceClassification.py b/mteb/tasks/Classification/sin/SinhalaNewsSourceClassification.py index a7bd9763a7..a665d1c0c9 100644 --- a/mteb/tasks/Classification/sin/SinhalaNewsSourceClassification.py +++ b/mteb/tasks/Classification/sin/SinhalaNewsSourceClassification.py @@ -26,13 +26,14 @@ class SinhalaNewsSourceClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation=""" - @article{dhananjaya2022, - author = {Dhananjaya et al.}, - title = {BERTifying Sinhala - A Comprehensive Analysis of Pre-trained Language Models for Sinhala Text Classification}, - journal = {Year of Publication}, - year = {2022}, - }""", + bibtex_citation=r""" +@article{dhananjaya2022, + author = {Dhananjaya et al.}, + journal = {Year of Publication}, + title = {BERTifying Sinhala - A Comprehensive Analysis of Pre-trained Language Models for Sinhala Text Classification}, + year = {2022}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Classification/slk/CSFDSKMovieReviewSentimentClassification.py b/mteb/tasks/Classification/slk/CSFDSKMovieReviewSentimentClassification.py index d13305bc9c..6577f7f315 100644 --- a/mteb/tasks/Classification/slk/CSFDSKMovieReviewSentimentClassification.py +++ b/mteb/tasks/Classification/slk/CSFDSKMovieReviewSentimentClassification.py @@ -26,14 +26,14 @@ class CSFDSKMovieReviewSentimentClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation=""" + bibtex_citation=r""" @misc{štefánik2023resources, - title={Resources and Few-shot Learners for In-context Learning in Slavic Languages}, - author={Michal Štefánik and Marek Kadlčík and Piotr Gramacki and Petr Sojka}, - year={2023}, - eprint={2304.01922}, - archivePrefix={arXiv}, 
- primaryClass={cs.CL} + archiveprefix = {arXiv}, + author = {Michal Štefánik and Marek Kadlčík and Piotr Gramacki and Petr Sojka}, + eprint = {2304.01922}, + primaryclass = {cs.CL}, + title = {Resources and Few-shot Learners for In-context Learning in Slavic Languages}, + year = {2023}, } """, ) diff --git a/mteb/tasks/Classification/slv/FrenkSlClassification.py b/mteb/tasks/Classification/slv/FrenkSlClassification.py index b5add7e3eb..f88d4ff9ff 100644 --- a/mteb/tasks/Classification/slv/FrenkSlClassification.py +++ b/mteb/tasks/Classification/slv/FrenkSlClassification.py @@ -27,15 +27,17 @@ class FrenkSlClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@misc{ljubešić2019frenk, - title={The FRENK Datasets of Socially Unacceptable Discourse in Slovene and English}, - author={Nikola Ljubešić and Darja Fišer and Tomaž Erjavec}, - year={2019}, - eprint={1906.02045}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/1906.02045} - }""", + bibtex_citation=r""" +@misc{ljubešić2019frenk, + archiveprefix = {arXiv}, + author = {Nikola Ljubešić and Darja Fišer and Tomaž Erjavec}, + eprint = {1906.02045}, + primaryclass = {cs.CL}, + title = {The FRENK Datasets of Socially Unacceptable Discourse in Slovene and English}, + url = {https://arxiv.org/abs/1906.02045}, + year = {2019}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Classification/spa/SpanishSentimentClassification.py b/mteb/tasks/Classification/spa/SpanishSentimentClassification.py index 28e56f87c9..785b131bbc 100644 --- a/mteb/tasks/Classification/spa/SpanishSentimentClassification.py +++ b/mteb/tasks/Classification/spa/SpanishSentimentClassification.py @@ -26,29 +26,29 @@ class SpanishSentimentClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation=""" - @inproceedings{mollanorozy-etal-2023-cross, - title = "Cross-lingual 
Transfer Learning with \{P\}ersian", - author = "Mollanorozy, Sepideh and - Tanti, Marc and - Nissim, Malvina", - editor = "Beinborn, Lisa and - Goswami, Koustava and - Murado{\\u{g}}lu, Saliha and - Sorokin, Alexey and - Kumar, Ritesh and - Shcherbakov, Andreas and - Ponti, Edoardo M. and - Cotterell, Ryan and - Vylomova, Ekaterina", - booktitle = "Proceedings of the 5th Workshop on Research in Computational Linguistic Typology and Multilingual NLP", - month = may, - year = "2023", - address = "Dubrovnik, Croatia", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/2023.sigtyp-1.9", - doi = "10.18653/v1/2023.sigtyp-1.9", - pages = "89--95", - } - """, + bibtex_citation=r""" +@inproceedings{mollanorozy-etal-2023-cross, + address = {Dubrovnik, Croatia}, + author = {Mollanorozy, Sepideh and +Tanti, Marc and +Nissim, Malvina}, + booktitle = {Proceedings of the 5th Workshop on Research in Computational Linguistic Typology and Multilingual NLP}, + doi = {10.18653/v1/2023.sigtyp-1.9}, + editor = {Beinborn, Lisa and +Goswami, Koustava and +Murado{\\u{g}}lu, Saliha and +Sorokin, Alexey and +Kumar, Ritesh and +Shcherbakov, Andreas and +Ponti, Edoardo M. 
and +Cotterell, Ryan and +Vylomova, Ekaterina}, + month = may, + pages = {89--95}, + publisher = {Association for Computational Linguistics}, + title = {Cross-lingual Transfer Learning with \{P\}ersian}, + url = {https://aclanthology.org/2023.sigtyp-1.9}, + year = {2023}, +} +""", ) diff --git a/mteb/tasks/Classification/ssw/SiswatiNewsClassification.py b/mteb/tasks/Classification/ssw/SiswatiNewsClassification.py index d51b42f88d..e5b667e289 100644 --- a/mteb/tasks/Classification/ssw/SiswatiNewsClassification.py +++ b/mteb/tasks/Classification/ssw/SiswatiNewsClassification.py @@ -26,8 +26,17 @@ class SiswatiNewsClassification(AbsTaskClassification): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@article{Madodonga_Marivate_Adendorff_2023, title={Izindaba-Tindzaba: Machine learning news categorisation for Long and Short Text for isiZulu and Siswati}, volume={4}, url={https://upjournals.up.ac.za/index.php/dhasa/article/view/4449}, DOI={10.55492/dhasa.v4i01.4449}, author={Madodonga, Andani and Marivate, Vukosi and Adendorff, Matthew}, year={2023}, month={Jan.} } - """, + bibtex_citation=r""" +@article{Madodonga_Marivate_Adendorff_2023, + author = {Madodonga, Andani and Marivate, Vukosi and Adendorff, Matthew}, + doi = {10.55492/dhasa.v4i01.4449}, + month = {Jan.}, + title = {Izindaba-Tindzaba: Machine learning news categorisation for Long and Short Text for isiZulu and Siswati}, + url = {https://upjournals.up.ac.za/index.php/dhasa/article/view/4449}, + volume = {4}, + year = {2023}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Classification/svk/SlovakMovieReviewSentimentClassification.py b/mteb/tasks/Classification/svk/SlovakMovieReviewSentimentClassification.py index 8918c4a1a4..25df08775d 100644 --- a/mteb/tasks/Classification/svk/SlovakMovieReviewSentimentClassification.py +++ b/mteb/tasks/Classification/svk/SlovakMovieReviewSentimentClassification.py @@ -26,14 +26,14 @@ class 
SlovakMovieReviewSentimentClassification(AbsTaskClassification): license="cc-by-nc-sa-4.0", annotations_creators="derived", sample_creation="found", - bibtex_citation=""" - @article{vstefanik2023resources, - title={Resources and Few-shot Learners for In-context Learning in Slavic Languages}, - author={{\v{S}}tef{\'a}nik, Michal and Kadl{\v{c}}{\'\i}k, Marek and Gramacki, Piotr and Sojka, Petr}, - journal={arXiv preprint arXiv:2304.01922}, - year={2023} - } - """, + bibtex_citation=r""" +@article{vstefanik2023resources, + author = {{\v{S}}tef{\'a}nik, Michal and Kadl{\v{c}}{\'\i}k, Marek and Gramacki, Piotr and Sojka, Petr}, + journal = {arXiv preprint arXiv:2304.01922}, + title = {Resources and Few-shot Learners for In-context Learning in Slavic Languages}, + year = {2023}, +} +""", ) def dataset_transform(self) -> None: diff --git a/mteb/tasks/Classification/swa/SwahiliNewsClassification.py b/mteb/tasks/Classification/swa/SwahiliNewsClassification.py index 6a4cb6bdc8..518b749de0 100644 --- a/mteb/tasks/Classification/swa/SwahiliNewsClassification.py +++ b/mteb/tasks/Classification/swa/SwahiliNewsClassification.py @@ -26,16 +26,16 @@ class SwahiliNewsClassification(AbsTaskClassification): license="cc-by-nc-sa-4.0", annotations_creators="derived", sample_creation="found", - bibtex_citation=""" - @inproceedings{davis2020swahili, - title = "Swahili: News Classification Dataset (0.2)", - author = "Davis, David", - year = "2020", - publisher = "Zenodo", - doi = "10.5281/zenodo.5514203", - url = "https://doi.org/10.5281/zenodo.5514203" - } - """, + bibtex_citation=r""" +@inproceedings{davis2020swahili, + author = {Davis, David}, + doi = {10.5281/zenodo.5514203}, + publisher = {Zenodo}, + title = {Swahili: News Classification Dataset (0.2)}, + url = {https://doi.org/10.5281/zenodo.5514203}, + year = {2020}, +} +""", ) def dataset_transform(self) -> None: diff --git a/mteb/tasks/Classification/swe/DalajClassification.py 
b/mteb/tasks/Classification/swe/DalajClassification.py index 780fe65dbf..05983d0e4f 100644 --- a/mteb/tasks/Classification/swe/DalajClassification.py +++ b/mteb/tasks/Classification/swe/DalajClassification.py @@ -29,12 +29,14 @@ class DalajClassification(AbsTaskClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="created", - bibtex_citation="""@misc{2105.06681, -Author = {Elena Volodina and Yousuf Ali Mohammed and Julia Klezl}, -Title = {DaLAJ - a dataset for linguistic acceptability judgments for Swedish: Format, baseline, sharing}, -Year = {2021}, -Eprint = {arXiv:2105.06681}, -}""", + bibtex_citation=r""" +@misc{2105.06681, + author = {Elena Volodina and Yousuf Ali Mohammed and Julia Klezl}, + eprint = {arXiv:2105.06681}, + title = {DaLAJ - a dataset for linguistic acceptability judgments for Swedish: Format, baseline, sharing}, + year = {2021}, +} +""", prompt="Classify texts based on linguistic acceptability in Swedish", ) diff --git a/mteb/tasks/Classification/swe/SweRecClassification.py b/mteb/tasks/Classification/swe/SweRecClassification.py index 7083ade1fb..8cc7b8dff8 100644 --- a/mteb/tasks/Classification/swe/SweRecClassification.py +++ b/mteb/tasks/Classification/swe/SweRecClassification.py @@ -26,18 +26,19 @@ class SweRecClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{nielsen-2023-scandeval, - title = "{S}cand{E}val: A Benchmark for {S}candinavian Natural Language Processing", - author = "Nielsen, Dan", - editor = {Alum{\"a}e, Tanel and - Fishel, Mark}, - booktitle = "Proceedings of the 24th Nordic Conference on Computational Linguistics (NoDaLiDa)", - month = may, - year = "2023", - address = "T{\'o}rshavn, Faroe Islands", - publisher = "University of Tartu Library", - url = "https://aclanthology.org/2023.nodalida-1.20", - pages = "185--201", + bibtex_citation=r""" +@inproceedings{nielsen-2023-scandeval, + address = 
{T{\'o}rshavn, Faroe Islands}, + author = {Nielsen, Dan}, + booktitle = {Proceedings of the 24th Nordic Conference on Computational Linguistics (NoDaLiDa)}, + editor = {Alum{\"a}e, Tanel and +Fishel, Mark}, + month = may, + pages = {185--201}, + publisher = {University of Tartu Library}, + title = {{S}cand{E}val: A Benchmark for {S}candinavian Natural Language Processing}, + url = {https://aclanthology.org/2023.nodalida-1.20}, + year = {2023}, } """, prompt="Classify Swedish reviews by sentiment", diff --git a/mteb/tasks/Classification/tam/TamilNewsClassification.py b/mteb/tasks/Classification/tam/TamilNewsClassification.py index af9698d0b1..3f4505bce8 100644 --- a/mteb/tasks/Classification/tam/TamilNewsClassification.py +++ b/mteb/tasks/Classification/tam/TamilNewsClassification.py @@ -26,12 +26,14 @@ class TamilNewsClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@article{kunchukuttan2020indicnlpcorpus, - title={AI4Bharat-IndicNLP Corpus: Monolingual Corpora and Word Embeddings for Indic Languages}, - author={Anoop Kunchukuttan and Divyanshu Kakwani and Satish Golla and Gokul N.C. and Avik Bhattacharyya and Mitesh M. Khapra and Pratyush Kumar}, - year={2020}, - journal={arXiv preprint arXiv:2005.00085}, -}""", + bibtex_citation=r""" +@article{kunchukuttan2020indicnlpcorpus, + author = {Anoop Kunchukuttan and Divyanshu Kakwani and Satish Golla and Gokul N.C. and Avik Bhattacharyya and Mitesh M. 
Khapra and Pratyush Kumar}, + journal = {arXiv preprint arXiv:2005.00085}, + title = {AI4Bharat-IndicNLP Corpus: Monolingual Corpora and Word Embeddings for Indic Languages}, + year = {2020}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Classification/tha/WisesightSentimentClassification.py b/mteb/tasks/Classification/tha/WisesightSentimentClassification.py index 3a76003d5b..7bd83863d2 100644 --- a/mteb/tasks/Classification/tha/WisesightSentimentClassification.py +++ b/mteb/tasks/Classification/tha/WisesightSentimentClassification.py @@ -27,20 +27,20 @@ class WisesightSentimentClassification(AbsTaskClassification): license="cc0-1.0", annotations_creators="expert-annotated", sample_creation="found", - bibtex_citation="""@software{bact_2019_3457447, - author = {Suriyawongkul, Arthit and - Chuangsuwanich, Ekapol and - Chormai, Pattarawat and - Polpanumas, Charin}, - title = {PyThaiNLP/wisesight-sentiment: First release}, - month = sep, - year = 2019, - publisher = {Zenodo}, - version = {v1.0}, - doi = {10.5281/zenodo.3457447}, - url = {https://doi.org/10.5281/zenodo.3457447} + bibtex_citation=r""" +@software{bact_2019_3457447, + author = {Suriyawongkul, Arthit and +Chuangsuwanich, Ekapol and +Chormai, Pattarawat and +Polpanumas, Charin}, + doi = {10.5281/zenodo.3457447}, + month = sep, + publisher = {Zenodo}, + title = {PyThaiNLP/wisesight-sentiment: First release}, + url = {https://doi.org/10.5281/zenodo.3457447}, + version = {v1.0}, + year = {2019}, } - """, ) diff --git a/mteb/tasks/Classification/tha/WongnaiReviewsClassification .py b/mteb/tasks/Classification/tha/WongnaiReviewsClassification .py index 4afd64dd21..9a51214759 100644 --- a/mteb/tasks/Classification/tha/WongnaiReviewsClassification .py +++ b/mteb/tasks/Classification/tha/WongnaiReviewsClassification .py @@ -26,17 +26,18 @@ class WongnaiReviewsClassification(AbsTaskClassification): license="lgpl-3.0", annotations_creators="derived", sample_creation="found", - bibtex_citation=""" - 
@software{cstorm125_2020_3852912, - author = {cstorm125 and lukkiddd}, - title = {PyThaiNLP/classification-benchmarks: v0.1-alpha}, - month = may, - year = 2020, - publisher = {Zenodo}, - version = {v0.1-alpha}, - doi = {10.5281/zenodo.3852912}, - url = {https://doi.org/10.5281/zenodo.3852912} - }""", + bibtex_citation=r""" +@software{cstorm125_2020_3852912, + author = {cstorm125 and lukkiddd}, + doi = {10.5281/zenodo.3852912}, + month = may, + publisher = {Zenodo}, + title = {PyThaiNLP/classification-benchmarks: v0.1-alpha}, + url = {https://doi.org/10.5281/zenodo.3852912}, + version = {v0.1-alpha}, + year = {2020}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Classification/tsn/TswanaNewsClassification.py b/mteb/tasks/Classification/tsn/TswanaNewsClassification.py index c1eee27779..e9095fd0d3 100644 --- a/mteb/tasks/Classification/tsn/TswanaNewsClassification.py +++ b/mteb/tasks/Classification/tsn/TswanaNewsClassification.py @@ -26,16 +26,16 @@ class TswanaNewsClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation=""" - @inproceedings{marivate2023puoberta, - title = {PuoBERTa: Training and evaluation of a curated language model for Setswana}, - author = {Vukosi Marivate and Moseli Mots'Oehli and Valencia Wagner and Richard Lastrucci and Isheanesu Dzingirai}, - year = {2023}, - booktitle= {SACAIR 2023 (To Appear)}, - keywords = {NLP}, - preprint_url = {https://arxiv.org/abs/2310.09141}, - dataset_url = {https://github.com/dsfsi/PuoBERTa}, - software_url = {https://huggingface.co/dsfsi/PuoBERTa} - } - """, + bibtex_citation=r""" +@inproceedings{marivate2023puoberta, + author = {Vukosi Marivate and Moseli Mots'Oehli and Valencia Wagner and Richard Lastrucci and Isheanesu Dzingirai}, + booktitle = {SACAIR 2023 (To Appear)}, + dataset_url = {https://github.com/dsfsi/PuoBERTa}, + keywords = {NLP}, + preprint_url = {https://arxiv.org/abs/2310.09141}, + software_url = 
{https://huggingface.co/dsfsi/PuoBERTa}, + title = {PuoBERTa: Training and evaluation of a curated language model for Setswana}, + year = {2023}, +} +""", ) diff --git a/mteb/tasks/Classification/tur/TurkishMovieSentimentClassification.py b/mteb/tasks/Classification/tur/TurkishMovieSentimentClassification.py index 64981c6ec2..680b52009b 100644 --- a/mteb/tasks/Classification/tur/TurkishMovieSentimentClassification.py +++ b/mteb/tasks/Classification/tur/TurkishMovieSentimentClassification.py @@ -26,15 +26,15 @@ class TurkishMovieSentimentClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation=""" - @inproceedings{Demirtas2013CrosslingualPD, - title={Cross-lingual polarity detection with machine translation}, - author={Erkin Demirtas and Mykola Pechenizkiy}, - booktitle={wisdom}, - year={2013}, - url={https://api.semanticscholar.org/CorpusID:3912960} - } - """, + bibtex_citation=r""" +@inproceedings{Demirtas2013CrosslingualPD, + author = {Erkin Demirtas and Mykola Pechenizkiy}, + booktitle = {wisdom}, + title = {Cross-lingual polarity detection with machine translation}, + url = {https://api.semanticscholar.org/CorpusID:3912960}, + year = {2013}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Classification/tur/TurkishProductSentimentClassification.py b/mteb/tasks/Classification/tur/TurkishProductSentimentClassification.py index c33c537c69..7bfb086d99 100644 --- a/mteb/tasks/Classification/tur/TurkishProductSentimentClassification.py +++ b/mteb/tasks/Classification/tur/TurkishProductSentimentClassification.py @@ -26,13 +26,13 @@ class TurkishProductSentimentClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation=""" - @inproceedings{Demirtas2013CrosslingualPD, - title={Cross-lingual polarity detection with machine translation}, - author={Erkin Demirtas and Mykola Pechenizkiy}, - booktitle={wisdom}, - year={2013}, 
- url={https://api.semanticscholar.org/CorpusID:3912960} - } - """, + bibtex_citation=r""" +@inproceedings{Demirtas2013CrosslingualPD, + author = {Erkin Demirtas and Mykola Pechenizkiy}, + booktitle = {wisdom}, + title = {Cross-lingual polarity detection with machine translation}, + url = {https://api.semanticscholar.org/CorpusID:3912960}, + year = {2013}, +} +""", ) diff --git a/mteb/tasks/Classification/ukr/UkrFormalityClassification.py b/mteb/tasks/Classification/ukr/UkrFormalityClassification.py index 0a7f08b8e0..fadc60edd8 100644 --- a/mteb/tasks/Classification/ukr/UkrFormalityClassification.py +++ b/mteb/tasks/Classification/ukr/UkrFormalityClassification.py @@ -32,16 +32,18 @@ class UkrFormalityClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="machine-translated", - bibtex_citation="""@inproceedings{rao-tetreault-2018-dear, - title = "Dear Sir or Madam, May {I} Introduce the {GYAFC} Dataset: Corpus, Benchmarks and Metrics for Formality Style Transfer", - author = "Rao, Sudha and - Tetreault, Joel", - booktitle = "Proceedings of the 2018 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)", - month = jun, - year = "2018", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/N18-1012", - }""", + bibtex_citation=r""" +@inproceedings{rao-tetreault-2018-dear, + author = {Rao, Sudha and +Tetreault, Joel}, + booktitle = {Proceedings of the 2018 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)}, + month = jun, + publisher = {Association for Computational Linguistics}, + title = {Dear Sir or Madam, May {I} Introduce the {GYAFC} Dataset: Corpus, Benchmarks and Metrics for Formality Style Transfer}, + url = {https://aclanthology.org/N18-1012}, + year = {2018}, +} +""", ) def 
dataset_transform(self): diff --git a/mteb/tasks/Classification/urd/UrduRomanSentimentClassification.py b/mteb/tasks/Classification/urd/UrduRomanSentimentClassification.py index 62440ef9c2..ff3d5032df 100644 --- a/mteb/tasks/Classification/urd/UrduRomanSentimentClassification.py +++ b/mteb/tasks/Classification/urd/UrduRomanSentimentClassification.py @@ -27,15 +27,15 @@ class UrduRomanSentimentClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{misc_roman_urdu_data_set_458, - author = {Sharf,Zareen}, - title = {{Roman Urdu Data Set}}, - year = {2018}, + bibtex_citation=r""" +@misc{misc_roman_urdu_data_set_458, + author = {Sharf,Zareen}, howpublished = {UCI Machine Learning Repository}, - note = {{DOI}: https://doi.org/10.24432/C58325} + note = {{DOI}: https://doi.org/10.24432/C58325}, + title = {{Roman Urdu Data Set}}, + year = {2018}, } - """, +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Classification/vie/VieStudentFeedbackClassification.py b/mteb/tasks/Classification/vie/VieStudentFeedbackClassification.py index 901d2861f9..8d40b89ff8 100644 --- a/mteb/tasks/Classification/vie/VieStudentFeedbackClassification.py +++ b/mteb/tasks/Classification/vie/VieStudentFeedbackClassification.py @@ -29,16 +29,18 @@ class VieStudentFeedbackClassification(AbsTaskClassification): annotations_creators="human-annotated", dialect=[], sample_creation="created", - bibtex_citation="""@InProceedings{8573337, - author={Nguyen, Kiet Van and Nguyen, Vu Duc and Nguyen, Phu X. V. and Truong, Tham T. H. 
and Nguyen, Ngan Luu-Thuy}, - booktitle={2018 10th International Conference on Knowledge and Systems Engineering (KSE)}, - title={UIT-VSFC: Vietnamese Students’ Feedback Corpus for Sentiment Analysis}, - year={2018}, - volume={}, - number={}, - pages={19-24}, - doi={10.1109/KSE.2018.8573337} -}""", + bibtex_citation=r""" +@inproceedings{8573337, + author = {Nguyen, Kiet Van and Nguyen, Vu Duc and Nguyen, Phu X. V. and Truong, Tham T. H. and Nguyen, Ngan Luu-Thuy}, + booktitle = {2018 10th International Conference on Knowledge and Systems Engineering (KSE)}, + doi = {10.1109/KSE.2018.8573337}, + number = {}, + pages = {19-24}, + title = {UIT-VSFC: Vietnamese Students’ Feedback Corpus for Sentiment Analysis}, + volume = {}, + year = {2018}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Classification/zho/CMTEBClassification.py b/mteb/tasks/Classification/zho/CMTEBClassification.py index 7e790ecf9a..64fb95298a 100644 --- a/mteb/tasks/Classification/zho/CMTEBClassification.py +++ b/mteb/tasks/Classification/zho/CMTEBClassification.py @@ -26,49 +26,51 @@ class TNews(AbsTaskClassification): annotations_creators=None, dialect=None, sample_creation=None, - bibtex_citation="""@inproceedings {xu-etal-2020-clue, - title = "{CLUE}: A {C}hinese Language Understanding Evaluation Benchmark", - author = "Xu, Liang and - Hu, Hai and - Zhang, Xuanwei and - Li, Lu and - Cao, Chenjie and - Li, Yudong and - Xu, Yechen and - Sun, Kai and - Yu, Dian and - Yu, Cong and - Tian, Yin and - Dong, Qianqian and - Liu, Weitang and - Shi, Bo and - Cui, Yiming and - Li, Junyi and - Zeng, Jun and - Wang, Rongzhao and - Xie, Weijian and - Li, Yanting and - Patterson, Yina and - Tian, Zuoyu and - Zhang, Yiwen and - Zhou, He and - Liu, Shaoweihua and - Zhao, Zhe and - Zhao, Qipeng and - Yue, Cong and - Zhang, Xinrui and - Yang, Zhengliang and - Richardson, Kyle and - Lan, Zhenzhong ", - booktitle = "Proceedings of the 28th International Conference on Computational Linguistics", - 
month = dec, - year = "2020", - address = "Barcelona, Spain (Online)", - publisher = "International Committee on Computational Linguistics", - url = "https://aclanthology.org/2020.coling-main.419", - doi = "10.18653/v1/2020.coling-main.419", - pages = "4762--4772", -}""", + bibtex_citation=r""" +@inproceedings{xu-etal-2020-clue, + address = {Barcelona, Spain (Online)}, + author = {Xu, Liang and +Hu, Hai and +Zhang, Xuanwei and +Li, Lu and +Cao, Chenjie and +Li, Yudong and +Xu, Yechen and +Sun, Kai and +Yu, Dian and +Yu, Cong and +Tian, Yin and +Dong, Qianqian and +Liu, Weitang and +Shi, Bo and +Cui, Yiming and +Li, Junyi and +Zeng, Jun and +Wang, Rongzhao and +Xie, Weijian and +Li, Yanting and +Patterson, Yina and +Tian, Zuoyu and +Zhang, Yiwen and +Zhou, He and +Liu, Shaoweihua and +Zhao, Zhe and +Zhao, Qipeng and +Yue, Cong and +Zhang, Xinrui and +Yang, Zhengliang and +Richardson, Kyle and +Lan, Zhenzhong }, + booktitle = {Proceedings of the 28th International Conference on Computational Linguistics}, + doi = {10.18653/v1/2020.coling-main.419}, + month = dec, + pages = {4762--4772}, + publisher = {International Committee on Computational Linguistics}, + title = {{CLUE}: A {C}hinese Language Understanding Evaluation Benchmark}, + url = {https://aclanthology.org/2020.coling-main.419}, + year = {2020}, +} +""", prompt="Classify the fine-grained category of the given news title", ) @@ -97,50 +99,52 @@ class IFlyTek(AbsTaskClassification): annotations_creators=None, dialect=None, sample_creation=None, - bibtex_citation="""@inproceedings {xu-etal-2020-clue, - title = "{CLUE}: A {C}hinese Language Understanding Evaluation Benchmark", - author = "Xu, Liang and - Hu, Hai and - Zhang, Xuanwei and - Li, Lu and - Cao, Chenjie and - Li, Yudong and - Xu, Yechen and - Sun, Kai and - Yu, Dian and - Yu, Cong and - Tian, Yin and - Dong, Qianqian and - Liu, Weitang and - Shi, Bo and - Cui, Yiming and - Li, Junyi and - Zeng, Jun and - Wang, Rongzhao and - Xie, Weijian and - Li, 
Yanting and - Patterson, Yina and - Tian, Zuoyu and - Zhang, Yiwen and - Zhou, He and - Liu, Shaoweihua and - Zhao, Zhe and - Zhao, Qipeng and - Yue, Cong and - Zhang, Xinrui and - Yang, Zhengliang and - Richardson, Kyle and - Lan, Zhenzhong ", - booktitle = "Proceedings of the 28th International Conference on Computational Linguistics", - month = dec, - year = "2020", - address = "Barcelona, Spain (Online)", - publisher = "International Committee on Computational Linguistics", - url = "https://aclanthology.org/2020.coling-main.419", - doi = "10.18653/v1/2020.coling-main.419", - pages = "4762--4772", - abstract = "The advent of natural language understanding (NLU) benchmarks for English, such as GLUE and SuperGLUE allows new NLU models to be evaluated across a diverse set of tasks. These comprehensive benchmarks have facilitated a broad range of research and applications in natural language processing (NLP). The problem, however, is that most such benchmarks are limited to English, which has made it difficult to replicate many of the successes in English NLU for other languages. To help remedy this issue, we introduce the first large-scale Chinese Language Understanding Evaluation (CLUE) benchmark. CLUE is an open-ended, community-driven project that brings together 9 tasks spanning several well-established single-sentence/sentence-pair classification tasks, as well as machine reading comprehension, all on original Chinese text. To establish results on these tasks, we report scores using an exhaustive set of current state-of-the-art pre-trained Chinese models (9 in total). We also introduce a number of supplementary datasets and additional tools to help facilitate further progress on Chinese NLU. 
Our benchmark is released at https://www.cluebenchmarks.com", -}""", + bibtex_citation=r""" +@inproceedings{xu-etal-2020-clue, + abstract = {The advent of natural language understanding (NLU) benchmarks for English, such as GLUE and SuperGLUE allows new NLU models to be evaluated across a diverse set of tasks. These comprehensive benchmarks have facilitated a broad range of research and applications in natural language processing (NLP). The problem, however, is that most such benchmarks are limited to English, which has made it difficult to replicate many of the successes in English NLU for other languages. To help remedy this issue, we introduce the first large-scale Chinese Language Understanding Evaluation (CLUE) benchmark. CLUE is an open-ended, community-driven project that brings together 9 tasks spanning several well-established single-sentence/sentence-pair classification tasks, as well as machine reading comprehension, all on original Chinese text. To establish results on these tasks, we report scores using an exhaustive set of current state-of-the-art pre-trained Chinese models (9 in total). We also introduce a number of supplementary datasets and additional tools to help facilitate further progress on Chinese NLU. 
Our benchmark is released at https://www.cluebenchmarks.com}, + address = {Barcelona, Spain (Online)}, + author = {Xu, Liang and +Hu, Hai and +Zhang, Xuanwei and +Li, Lu and +Cao, Chenjie and +Li, Yudong and +Xu, Yechen and +Sun, Kai and +Yu, Dian and +Yu, Cong and +Tian, Yin and +Dong, Qianqian and +Liu, Weitang and +Shi, Bo and +Cui, Yiming and +Li, Junyi and +Zeng, Jun and +Wang, Rongzhao and +Xie, Weijian and +Li, Yanting and +Patterson, Yina and +Tian, Zuoyu and +Zhang, Yiwen and +Zhou, He and +Liu, Shaoweihua and +Zhao, Zhe and +Zhao, Qipeng and +Yue, Cong and +Zhang, Xinrui and +Yang, Zhengliang and +Richardson, Kyle and +Lan, Zhenzhong }, + booktitle = {Proceedings of the 28th International Conference on Computational Linguistics}, + doi = {10.18653/v1/2020.coling-main.419}, + month = dec, + pages = {4762--4772}, + publisher = {International Committee on Computational Linguistics}, + title = {{CLUE}: A {C}hinese Language Understanding Evaluation Benchmark}, + url = {https://aclanthology.org/2020.coling-main.419}, + year = {2020}, +} +""", prompt="Given an App description text, find the appropriate fine-grained category", ) @@ -204,12 +208,14 @@ class JDReview(AbsTaskClassification): annotations_creators=None, dialect=None, sample_creation=None, - bibtex_citation="""@article{xiao2023c, - title={C-pack: Packaged resources to advance general chinese embedding}, - author={Xiao, Shitao and Liu, Zheng and Zhang, Peitian and Muennighof, Niklas}, - journal={arXiv preprint arXiv:2309.07597}, - year={2023} -}""", + bibtex_citation=r""" +@article{xiao2023c, + author = {Xiao, Shitao and Liu, Zheng and Zhang, Peitian and Muennighof, Niklas}, + journal = {arXiv preprint arXiv:2309.07597}, + title = {C-pack: Packaged resources to advance general chinese embedding}, + year = {2023}, +} +""", prompt="Classify the customer review for iPhone on e-commerce platform into positive or negative", ) @@ -238,12 +244,14 @@ class OnlineShopping(AbsTaskClassification): 
annotations_creators=None, dialect=None, sample_creation=None, - bibtex_citation="""@article{xiao2023c, - title={C-pack: Packaged resources to advance general chinese embedding}, - author={Xiao, Shitao and Liu, Zheng and Zhang, Peitian and Muennighof, Niklas}, - journal={arXiv preprint arXiv:2309.07597}, - year={2023} -}""", + bibtex_citation=r""" +@article{xiao2023c, + author = {Xiao, Shitao and Liu, Zheng and Zhang, Peitian and Muennighof, Niklas}, + journal = {arXiv preprint arXiv:2309.07597}, + title = {C-pack: Packaged resources to advance general chinese embedding}, + year = {2023}, +} +""", prompt="Classify the customer review for online shopping into positive or negative", ) @@ -272,12 +280,14 @@ class Waimai(AbsTaskClassification): annotations_creators=None, dialect=None, sample_creation=None, - bibtex_citation="""@article{xiao2023c, - title={C-pack: Packaged resources to advance general chinese embedding}, - author={Xiao, Shitao and Liu, Zheng and Zhang, Peitian and Muennighof, Niklas}, - journal={arXiv preprint arXiv:2309.07597}, - year={2023} -}""", + bibtex_citation=r""" +@article{xiao2023c, + author = {Xiao, Shitao and Liu, Zheng and Zhang, Peitian and Muennighof, Niklas}, + journal = {arXiv preprint arXiv:2309.07597}, + title = {C-pack: Packaged resources to advance general chinese embedding}, + year = {2023}, +} +""", prompt="Classify the customer review from a food takeaway platform into positive or negative", ) diff --git a/mteb/tasks/Classification/zho/YueOpenriceReviewClassification.py b/mteb/tasks/Classification/zho/YueOpenriceReviewClassification.py index 2189708719..7c6134a731 100644 --- a/mteb/tasks/Classification/zho/YueOpenriceReviewClassification.py +++ b/mteb/tasks/Classification/zho/YueOpenriceReviewClassification.py @@ -26,14 +26,16 @@ class YueOpenriceReviewClassification(AbsTaskClassification): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{xiang2019sentiment, - 
title={Sentiment Augmented Attention Network for Cantonese Restaurant Review Analysis}, - author={Xiang, Rong and Jiao, Ying and Lu, Qin}, - booktitle={Proceedings of the 8th KDD Workshop on Issues of Sentiment Discovery and Opinion Mining (WISDOM)}, - pages={1--9}, - year={2019}, - organization={KDD WISDOM} -}""", + bibtex_citation=r""" +@inproceedings{xiang2019sentiment, + author = {Xiang, Rong and Jiao, Ying and Lu, Qin}, + booktitle = {Proceedings of the 8th KDD Workshop on Issues of Sentiment Discovery and Opinion Mining (WISDOM)}, + organization = {KDD WISDOM}, + pages = {1--9}, + title = {Sentiment Augmented Attention Network for Cantonese Restaurant Review Analysis}, + year = {2019}, +} +""", ) samples_per_label = 32 diff --git a/mteb/tasks/Classification/zul/IsiZuluNewsClassification.py b/mteb/tasks/Classification/zul/IsiZuluNewsClassification.py index 26e3d16553..f8ca8c8e36 100644 --- a/mteb/tasks/Classification/zul/IsiZuluNewsClassification.py +++ b/mteb/tasks/Classification/zul/IsiZuluNewsClassification.py @@ -26,8 +26,17 @@ class IsiZuluNewsClassification(AbsTaskClassification): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@article{Madodonga_Marivate_Adendorff_2023, title={Izindaba-Tindzaba: Machine learning news categorisation for Long and Short Text for isiZulu and Siswati}, volume={4}, url={https://upjournals.up.ac.za/index.php/dhasa/article/view/4449}, DOI={10.55492/dhasa.v4i01.4449}, author={Madodonga, Andani and Marivate, Vukosi and Adendorff, Matthew}, year={2023}, month={Jan.} } - """, + bibtex_citation=r""" +@article{Madodonga_Marivate_Adendorff_2023, + author = {Madodonga, Andani and Marivate, Vukosi and Adendorff, Matthew}, + doi = {10.55492/dhasa.v4i01.4449}, + month = {Jan.}, + title = {Izindaba-Tindzaba: Machine learning news categorisation for Long and Short Text for isiZulu and Siswati}, + url = {https://upjournals.up.ac.za/index.php/dhasa/article/view/4449}, + volume = {4}, + year = 
{2023}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Clustering/deu/BlurbsClusteringP2P.py b/mteb/tasks/Clustering/deu/BlurbsClusteringP2P.py index dbe155658e..859b13f5a7 100644 --- a/mteb/tasks/Clustering/deu/BlurbsClusteringP2P.py +++ b/mteb/tasks/Clustering/deu/BlurbsClusteringP2P.py @@ -26,19 +26,21 @@ class BlurbsClusteringP2P(AbsTaskClustering): main_score="v_measure", date=None, form=None, - domains=None, + domains=["Written"], task_subtypes=None, license=None, annotations_creators=None, dialect=None, sample_creation=None, - bibtex_citation="""@inproceedings{Remus2019GermEval2T, - title={GermEval 2019 Task 1: Hierarchical Classification of Blurbs}, - author={Steffen Remus and Rami Aly and Chris Biemann}, - booktitle={Conference on Natural Language Processing}, - year={2019}, - url={https://api.semanticscholar.org/CorpusID:208334484} -}""", + bibtex_citation=r""" +@inproceedings{Remus2019GermEval2T, + author = {Steffen Remus and Rami Aly and Chris Biemann}, + booktitle = {Conference on Natural Language Processing}, + title = {GermEval 2019 Task 1: Hierarchical Classification of Blurbs}, + url = {https://api.semanticscholar.org/CorpusID:208334484}, + year = {2019}, +} +""", ) @@ -72,13 +74,15 @@ class BlurbsClusteringP2PFast(AbsTaskClusteringFast): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{Remus2019GermEval2T, - title={GermEval 2019 Task 1: Hierarchical Classification of Blurbs}, - author={Steffen Remus and Rami Aly and Chris Biemann}, - booktitle={Conference on Natural Language Processing}, - year={2019}, - url={https://api.semanticscholar.org/CorpusID:208334484} -}""", + bibtex_citation=r""" +@inproceedings{Remus2019GermEval2T, + author = {Steffen Remus and Rami Aly and Chris Biemann}, + booktitle = {Conference on Natural Language Processing}, + title = {GermEval 2019 Task 1: Hierarchical Classification of Blurbs}, + url = {https://api.semanticscholar.org/CorpusID:208334484}, + 
year = {2019}, +} +""", adapted_from=["BlurbsClusteringP2P"], ) diff --git a/mteb/tasks/Clustering/deu/BlurbsClusteringS2S.py b/mteb/tasks/Clustering/deu/BlurbsClusteringS2S.py index 1a54ed8242..354efa59ec 100644 --- a/mteb/tasks/Clustering/deu/BlurbsClusteringS2S.py +++ b/mteb/tasks/Clustering/deu/BlurbsClusteringS2S.py @@ -34,19 +34,21 @@ class BlurbsClusteringS2S(AbsTaskClustering): main_score="v_measure", date=None, form=None, - domains=None, + domains=["Written"], task_subtypes=None, license=None, annotations_creators=None, dialect=None, sample_creation=None, - bibtex_citation="""@inproceedings{Remus2019GermEval2T, - title={GermEval 2019 Task 1: Hierarchical Classification of Blurbs}, - author={Steffen Remus and Rami Aly and Chris Biemann}, - booktitle={Conference on Natural Language Processing}, - year={2019}, - url={https://api.semanticscholar.org/CorpusID:208334484} -}""", + bibtex_citation=r""" +@inproceedings{Remus2019GermEval2T, + author = {Steffen Remus and Rami Aly and Chris Biemann}, + booktitle = {Conference on Natural Language Processing}, + title = {GermEval 2019 Task 1: Hierarchical Classification of Blurbs}, + url = {https://api.semanticscholar.org/CorpusID:208334484}, + year = {2019}, +} +""", ) @@ -81,13 +83,15 @@ class BlurbsClusteringS2SFast(AbsTaskClusteringFast): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{Remus2019GermEval2T, - title={GermEval 2019 Task 1: Hierarchical Classification of Blurbs}, - author={Steffen Remus and Rami Aly and Chris Biemann}, - booktitle={Conference on Natural Language Processing}, - year={2019}, - url={https://api.semanticscholar.org/CorpusID:208334484} -}""", + bibtex_citation=r""" +@inproceedings{Remus2019GermEval2T, + author = {Steffen Remus and Rami Aly and Chris Biemann}, + booktitle = {Conference on Natural Language Processing}, + title = {GermEval 2019 Task 1: Hierarchical Classification of Blurbs}, + url = 
{https://api.semanticscholar.org/CorpusID:208334484}, + year = {2019}, +} +""", adapted_from=["BlurbsClusteringS2S"], ) diff --git a/mteb/tasks/Clustering/deu/TenKGnadClusteringS2S.py b/mteb/tasks/Clustering/deu/TenKGnadClusteringS2S.py index f62c0f0aca..d4ac2f2581 100644 --- a/mteb/tasks/Clustering/deu/TenKGnadClusteringS2S.py +++ b/mteb/tasks/Clustering/deu/TenKGnadClusteringS2S.py @@ -24,8 +24,8 @@ class TenKGnadClusteringS2S(AbsTaskClustering): main_score="v_measure", date=None, form=None, - domains=None, - task_subtypes=None, + domains=["News", "Non-fiction", "Written"], + task_subtypes=["Topic classification"], license=None, annotations_creators=None, dialect=None, @@ -57,7 +57,7 @@ class TenKGnadClusteringS2SFast(AbsTaskClusteringFast): "2020-12-31", ), # since it is news it is guessed that it is from 2000 to 2020 domains=["News", "Non-fiction", "Written"], - task_subtypes=None, + task_subtypes=["Topic classification"], license="cc-by-sa-4.0", annotations_creators="derived", dialect=[], diff --git a/mteb/tasks/Clustering/eng/ArxivClusteringP2P.py b/mteb/tasks/Clustering/eng/ArxivClusteringP2P.py index 36155e7efc..efdd2a828b 100644 --- a/mteb/tasks/Clustering/eng/ArxivClusteringP2P.py +++ b/mteb/tasks/Clustering/eng/ArxivClusteringP2P.py @@ -29,14 +29,16 @@ class ArxivClusteringP2P(AbsTaskClustering): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@misc{arxiv_org_submitters_2024, - title={arXiv Dataset}, - url={https://www.kaggle.com/dsv/7548853}, - DOI={10.34740/KAGGLE/DSV/7548853}, - publisher={Kaggle}, - author={arXiv.org submitters}, - year={2024} -}""", + bibtex_citation=r""" +@misc{arxiv_org_submitters_2024, + author = {arXiv.org submitters}, + doi = {10.34740/KAGGLE/DSV/7548853}, + publisher = {Kaggle}, + title = {arXiv Dataset}, + url = {https://www.kaggle.com/dsv/7548853}, + year = {2024}, +} +""", prompt="Identify the main and secondary category of Arxiv papers based on the titles and abstracts", ) @@ 
-67,14 +69,16 @@ class ArxivClusteringP2PFast(AbsTaskClustering): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@misc{arxiv_org_submitters_2024, - title={arXiv Dataset}, - url={https://www.kaggle.com/dsv/7548853}, - DOI={10.34740/KAGGLE/DSV/7548853}, - publisher={Kaggle}, - author={arXiv.org submitters}, - year={2024} -}""", # None found + bibtex_citation=r""" +@misc{arxiv_org_submitters_2024, + author = {arXiv.org submitters}, + doi = {10.34740/KAGGLE/DSV/7548853}, + publisher = {Kaggle}, + title = {arXiv Dataset}, + url = {https://www.kaggle.com/dsv/7548853}, + year = {2024}, +} +""", # None found prompt="Identify the main and secondary category of Arxiv papers based on the titles and abstracts", adapted_from=["ArxivClusteringP2P"], ) diff --git a/mteb/tasks/Clustering/eng/ArxivClusteringS2S.py b/mteb/tasks/Clustering/eng/ArxivClusteringS2S.py index 8b4beb0e26..d92139e5a9 100644 --- a/mteb/tasks/Clustering/eng/ArxivClusteringS2S.py +++ b/mteb/tasks/Clustering/eng/ArxivClusteringS2S.py @@ -28,13 +28,15 @@ class ArxivClusteringS2S(AbsTaskClustering): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@misc{arxiv_org_submitters_2024, - title={arXiv Dataset}, - url={https://www.kaggle.com/dsv/7548853}, - DOI={10.34740/KAGGLE/DSV/7548853}, - publisher={Kaggle}, - author={arXiv.org submitters}, - year={2024} -}""", + bibtex_citation=r""" +@misc{arxiv_org_submitters_2024, + author = {arXiv.org submitters}, + doi = {10.34740/KAGGLE/DSV/7548853}, + publisher = {Kaggle}, + title = {arXiv Dataset}, + url = {https://www.kaggle.com/dsv/7548853}, + year = {2024}, +} +""", prompt="Identify the main and secondary category of Arxiv papers based on the titles", ) diff --git a/mteb/tasks/Clustering/eng/BigPatentClustering.py b/mteb/tasks/Clustering/eng/BigPatentClustering.py index 306119fed8..2148c7bb60 100644 --- a/mteb/tasks/Clustering/eng/BigPatentClustering.py +++ 
b/mteb/tasks/Clustering/eng/BigPatentClustering.py @@ -36,22 +36,24 @@ class BigPatentClustering(AbsTaskClustering): annotations_creators=None, dialect=None, sample_creation=None, - bibtex_citation="""@article{DBLP:journals/corr/abs-1906-03741, - author = {Eva Sharma and - Chen Li and - Lu Wang}, - title = {{BIGPATENT:} {A} Large-Scale Dataset for Abstractive and Coherent - Summarization}, - journal = {CoRR}, - volume = {abs/1906.03741}, - year = {2019}, - url = {http://arxiv.org/abs/1906.03741}, + bibtex_citation=r""" +@article{DBLP:journals/corr/abs-1906-03741, + author = {Eva Sharma and +Chen Li and +Lu Wang}, + bibsource = {dblp computer science bibliography, https://dblp.org}, + biburl = {https://dblp.org/rec/journals/corr/abs-1906-03741.bib}, + eprint = {1906.03741}, eprinttype = {arXiv}, - eprint = {1906.03741}, + journal = {CoRR}, timestamp = {Wed, 26 Jun 2019 07:14:58 +0200}, - biburl = {https://dblp.org/rec/journals/corr/abs-1906-03741.bib}, - bibsource = {dblp computer science bibliography, https://dblp.org} -}""", + title = {{BIGPATENT:} {A} Large-Scale Dataset for Abstractive and Coherent +Summarization}, + url = {http://arxiv.org/abs/1906.03741}, + volume = {abs/1906.03741}, + year = {2019}, +} +""", ) @@ -82,22 +84,24 @@ class BigPatentClusteringFast(AbsTaskClusteringFast): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@article{DBLP:journals/corr/abs-1906-03741, - author = {Eva Sharma and - Chen Li and - Lu Wang}, - title = {{BIGPATENT:} {A} Large-Scale Dataset for Abstractive and Coherent - Summarization}, - journal = {CoRR}, - volume = {abs/1906.03741}, - year = {2019}, - url = {http://arxiv.org/abs/1906.03741}, + bibtex_citation=r""" +@article{DBLP:journals/corr/abs-1906-03741, + author = {Eva Sharma and +Chen Li and +Lu Wang}, + bibsource = {dblp computer science bibliography, https://dblp.org}, + biburl = {https://dblp.org/rec/journals/corr/abs-1906-03741.bib}, + eprint = {1906.03741}, eprinttype = 
{arXiv}, - eprint = {1906.03741}, + journal = {CoRR}, timestamp = {Wed, 26 Jun 2019 07:14:58 +0200}, - biburl = {https://dblp.org/rec/journals/corr/abs-1906-03741.bib}, - bibsource = {dblp computer science bibliography, https://dblp.org} -}""", + title = {{BIGPATENT:} {A} Large-Scale Dataset for Abstractive and Coherent +Summarization}, + url = {http://arxiv.org/abs/1906.03741}, + volume = {abs/1906.03741}, + year = {2019}, +} +""", adapted_from=["BigPatentClustering"], ) diff --git a/mteb/tasks/Clustering/eng/BuiltBenchClusteringP2P.py b/mteb/tasks/Clustering/eng/BuiltBenchClusteringP2P.py index a7739a11da..8dc95ad597 100644 --- a/mteb/tasks/Clustering/eng/BuiltBenchClusteringP2P.py +++ b/mteb/tasks/Clustering/eng/BuiltBenchClusteringP2P.py @@ -26,11 +26,13 @@ class BuiltBenchClusteringP2P(AbsTaskClustering): annotations_creators="derived", dialect=[], sample_creation="created", - bibtex_citation="""@article{shahinmoghadam2024benchmarking, - title={Benchmarking pre-trained text embedding models in aligning built asset information}, - author={Shahinmoghadam, Mehrzad and Motamedi, Ali}, - journal={arXiv preprint arXiv:2411.12056}, - year={2024} -}""", + bibtex_citation=r""" +@article{shahinmoghadam2024benchmarking, + author = {Shahinmoghadam, Mehrzad and Motamedi, Ali}, + journal = {arXiv preprint arXiv:2411.12056}, + title = {Benchmarking pre-trained text embedding models in aligning built asset information}, + year = {2024}, +} +""", prompt="Identify the category of the built asset entities based on the entity description", ) diff --git a/mteb/tasks/Clustering/eng/BuiltBenchClusteringS2S.py b/mteb/tasks/Clustering/eng/BuiltBenchClusteringS2S.py index 58b53a476d..78f0bb471b 100644 --- a/mteb/tasks/Clustering/eng/BuiltBenchClusteringS2S.py +++ b/mteb/tasks/Clustering/eng/BuiltBenchClusteringS2S.py @@ -26,11 +26,13 @@ class BuiltBenchClusteringS2S(AbsTaskClustering): annotations_creators="derived", dialect=[], sample_creation="created", - 
bibtex_citation="""@article{shahinmoghadam2024benchmarking, - title={Benchmarking pre-trained text embedding models in aligning built asset information}, - author={Shahinmoghadam, Mehrzad and Motamedi, Ali}, - journal={arXiv preprint arXiv:2411.12056}, - year={2024} -}""", + bibtex_citation=r""" +@article{shahinmoghadam2024benchmarking, + author = {Shahinmoghadam, Mehrzad and Motamedi, Ali}, + journal = {arXiv preprint arXiv:2411.12056}, + title = {Benchmarking pre-trained text embedding models in aligning built asset information}, + year = {2024}, +} +""", prompt="Identify the category of the built asset entities based on the names or titles", ) diff --git a/mteb/tasks/Clustering/eng/ClusTrecCovid.py b/mteb/tasks/Clustering/eng/ClusTrecCovid.py index 51fb455cd6..b6c10103a5 100644 --- a/mteb/tasks/Clustering/eng/ClusTrecCovid.py +++ b/mteb/tasks/Clustering/eng/ClusTrecCovid.py @@ -29,19 +29,20 @@ class ClusTrecCovid(AbsTaskClusteringFast, MultilingualTask): annotations_creators="expert-annotated", dialect=[], sample_creation="created", - bibtex_citation="""@inproceedings{katz-etal-2024-knowledge, - title = "Knowledge Navigator: {LLM}-guided Browsing Framework for Exploratory Search in Scientific Literature", - author = "Katz, Uri and - Levy, Mosh and - Goldberg, Yoav", - booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024", - month = nov, - year = "2024", - address = "Miami, Florida, USA", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/2024.findings-emnlp.516", - pages = "8838--8855", - } - """, + bibtex_citation=r""" +@inproceedings{katz-etal-2024-knowledge, + address = {Miami, Florida, USA}, + author = {Katz, Uri and +Levy, Mosh and +Goldberg, Yoav}, + booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2024}, + month = nov, + pages = {8838--8855}, + publisher = {Association for Computational Linguistics}, + title = {Knowledge Navigator: {LLM}-guided Browsing 
Framework for Exploratory Search in Scientific Literature}, + url = {https://aclanthology.org/2024.findings-emnlp.516}, + year = {2024}, +} +""", prompt="Identify the main category of the covid-19 papers based on the titles and abstracts", ) diff --git a/mteb/tasks/Clustering/eng/RedditClustering.py b/mteb/tasks/Clustering/eng/RedditClustering.py index a49b2b63c8..e1d111b8e6 100644 --- a/mteb/tasks/Clustering/eng/RedditClustering.py +++ b/mteb/tasks/Clustering/eng/RedditClustering.py @@ -34,19 +34,21 @@ class RedditFastClusteringS2S(AbsTaskClusteringFast): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@article{geigle:2021:arxiv, - author = {Gregor Geigle and - Nils Reimers and - Andreas R{\"u}ckl{\'e} and - Iryna Gurevych}, - title = {TWEAC: Transformer with Extendable QA Agent Classifiers}, - journal = {arXiv preprint}, - volume = {abs/2104.07081}, - year = {2021}, - url = {http://arxiv.org/abs/2104.07081}, - archivePrefix = {arXiv}, - eprint = {2104.07081} - }""", + bibtex_citation=r""" +@article{geigle:2021:arxiv, + archiveprefix = {arXiv}, + author = {Gregor Geigle and +Nils Reimers and +Andreas R{\"u}ckl{\'e} and +Iryna Gurevych}, + eprint = {2104.07081}, + journal = {arXiv preprint}, + title = {TWEAC: Transformer with Extendable QA Agent Classifiers}, + url = {http://arxiv.org/abs/2104.07081}, + volume = {abs/2104.07081}, + year = {2021}, +} +""", prompt="Identify the topic or theme of Reddit posts based on the titles", adapted_from=["RedditClustering"], ) @@ -93,18 +95,20 @@ class RedditClustering(AbsTaskClustering): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@article{geigle:2021:arxiv, - author = {Gregor Geigle and - Nils Reimers and - Andreas R{\"u}ckl{\'e} and - Iryna Gurevych}, - title = {TWEAC: Transformer with Extendable QA Agent Classifiers}, - journal = {arXiv preprint}, - volume = {abs/2104.07081}, - year = {2021}, - url = {http://arxiv.org/abs/2104.07081}, - 
archivePrefix = {arXiv}, - eprint = {2104.07081} - }""", + bibtex_citation=r""" +@article{geigle:2021:arxiv, + archiveprefix = {arXiv}, + author = {Gregor Geigle and +Nils Reimers and +Andreas R{\"u}ckl{\'e} and +Iryna Gurevych}, + eprint = {2104.07081}, + journal = {arXiv preprint}, + title = {TWEAC: Transformer with Extendable QA Agent Classifiers}, + url = {http://arxiv.org/abs/2104.07081}, + volume = {abs/2104.07081}, + year = {2021}, +} +""", prompt="Identify the topic or theme of Reddit posts based on the titles", ) diff --git a/mteb/tasks/Clustering/eng/RedditClusteringP2P.py b/mteb/tasks/Clustering/eng/RedditClusteringP2P.py index 243291cdbb..78352a2356 100644 --- a/mteb/tasks/Clustering/eng/RedditClusteringP2P.py +++ b/mteb/tasks/Clustering/eng/RedditClusteringP2P.py @@ -36,19 +36,21 @@ class RedditClusteringP2P(AbsTaskClustering): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@article{geigle:2021:arxiv, - author = {Gregor Geigle and - Nils Reimers and - Andreas R{\"u}ckl{\'e} and - Iryna Gurevych}, - title = {TWEAC: Transformer with Extendable QA Agent Classifiers}, - journal = {arXiv preprint}, - volume = {abs/2104.07081}, - year = {2021}, - url = {http://arxiv.org/abs/2104.07081}, - archivePrefix = {arXiv}, - eprint = {2104.07081} - }""", + bibtex_citation=r""" +@article{geigle:2021:arxiv, + archiveprefix = {arXiv}, + author = {Gregor Geigle and +Nils Reimers and +Andreas R{\"u}ckl{\'e} and +Iryna Gurevych}, + eprint = {2104.07081}, + journal = {arXiv preprint}, + title = {TWEAC: Transformer with Extendable QA Agent Classifiers}, + url = {http://arxiv.org/abs/2104.07081}, + volume = {abs/2104.07081}, + year = {2021}, +} +""", prompt="Identify the topic or theme of Reddit posts based on the titles and posts", ) @@ -75,19 +77,21 @@ class RedditFastClusteringP2P(AbsTaskClusteringFast): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@article{geigle:2021:arxiv, - author = 
{Gregor Geigle and - Nils Reimers and - Andreas R{\"u}ckl{\'e} and - Iryna Gurevych}, - title = {TWEAC: Transformer with Extendable QA Agent Classifiers}, - journal = {arXiv preprint}, - volume = {abs/2104.07081}, - year = {2021}, - url = {http://arxiv.org/abs/2104.07081}, - archivePrefix = {arXiv}, - eprint = {2104.07081} - }""", + bibtex_citation=r""" +@article{geigle:2021:arxiv, + archiveprefix = {arXiv}, + author = {Gregor Geigle and +Nils Reimers and +Andreas R{\"u}ckl{\'e} and +Iryna Gurevych}, + eprint = {2104.07081}, + journal = {arXiv preprint}, + title = {TWEAC: Transformer with Extendable QA Agent Classifiers}, + url = {http://arxiv.org/abs/2104.07081}, + volume = {abs/2104.07081}, + year = {2021}, +} +""", prompt="Identify the topic or theme of Reddit posts based on the titles and posts", adapted_from=["RedditClusteringP2P"], ) diff --git a/mteb/tasks/Clustering/eng/StackExchangeClustering.py b/mteb/tasks/Clustering/eng/StackExchangeClustering.py index 3ab53e4c0b..419f04c2c7 100644 --- a/mteb/tasks/Clustering/eng/StackExchangeClustering.py +++ b/mteb/tasks/Clustering/eng/StackExchangeClustering.py @@ -34,19 +34,21 @@ class StackExchangeClusteringFast(AbsTaskClusteringFast): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@article{geigle:2021:arxiv, - author = {Gregor Geigle and - Nils Reimers and - Andreas R{\"u}ckl{\'e} and - Iryna Gurevych}, - title = {TWEAC: Transformer with Extendable QA Agent Classifiers}, - journal = {arXiv preprint}, - volume = {abs/2104.07081}, - year = {2021}, - url = {http://arxiv.org/abs/2104.07081}, - archivePrefix = {arXiv}, - eprint = {2104.07081} - }""", + bibtex_citation=r""" +@article{geigle:2021:arxiv, + archiveprefix = {arXiv}, + author = {Gregor Geigle and +Nils Reimers and +Andreas R{\"u}ckl{\'e} and +Iryna Gurevych}, + eprint = {2104.07081}, + journal = {arXiv preprint}, + title = {TWEAC: Transformer with Extendable QA Agent Classifiers}, + url = 
{http://arxiv.org/abs/2104.07081}, + volume = {abs/2104.07081}, + year = {2021}, +} +""", prompt="Identify the topic or theme of StackExchange posts based on the titles", adapted_from=["StackExchangeClustering"], ) @@ -95,18 +97,20 @@ class StackExchangeClustering(AbsTaskClustering): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@article{geigle:2021:arxiv, - author = {Gregor Geigle and - Nils Reimers and - Andreas R{\"u}ckl{\'e} and - Iryna Gurevych}, - title = {TWEAC: Transformer with Extendable QA Agent Classifiers}, - journal = {arXiv preprint}, - volume = {abs/2104.07081}, - year = {2021}, - url = {http://arxiv.org/abs/2104.07081}, - archivePrefix = {arXiv}, - eprint = {2104.07081} - }""", + bibtex_citation=r""" +@article{geigle:2021:arxiv, + archiveprefix = {arXiv}, + author = {Gregor Geigle and +Nils Reimers and +Andreas R{\"u}ckl{\'e} and +Iryna Gurevych}, + eprint = {2104.07081}, + journal = {arXiv preprint}, + title = {TWEAC: Transformer with Extendable QA Agent Classifiers}, + url = {http://arxiv.org/abs/2104.07081}, + volume = {abs/2104.07081}, + year = {2021}, +} +""", prompt="Identify the topic or theme of StackExchange posts based on the titles", ) diff --git a/mteb/tasks/Clustering/eng/StackExchangeClusteringP2P.py b/mteb/tasks/Clustering/eng/StackExchangeClusteringP2P.py index 40b3bd82d6..c485e769da 100644 --- a/mteb/tasks/Clustering/eng/StackExchangeClusteringP2P.py +++ b/mteb/tasks/Clustering/eng/StackExchangeClusteringP2P.py @@ -36,19 +36,21 @@ class StackExchangeClusteringP2PFast(AbsTaskClusteringFast): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@article{geigle:2021:arxiv, - author = {Gregor Geigle and - Nils Reimers and - Andreas R{\"u}ckl{\'e} and - Iryna Gurevych}, - title = {TWEAC: Transformer with Extendable QA Agent Classifiers}, - journal = {arXiv preprint}, - volume = {abs/2104.07081}, - year = {2021}, - url = {http://arxiv.org/abs/2104.07081}, - 
archivePrefix = {arXiv}, - eprint = {2104.07081} - }""", + bibtex_citation=r""" +@article{geigle:2021:arxiv, + archiveprefix = {arXiv}, + author = {Gregor Geigle and +Nils Reimers and +Andreas R{\"u}ckl{\'e} and +Iryna Gurevych}, + eprint = {2104.07081}, + journal = {arXiv preprint}, + title = {TWEAC: Transformer with Extendable QA Agent Classifiers}, + url = {http://arxiv.org/abs/2104.07081}, + volume = {abs/2104.07081}, + year = {2021}, +} +""", prompt="Identify the topic or theme of StackExchange posts based on the given paragraphs", adapted_from=["StackExchangeClusteringP2P"], ) @@ -99,18 +101,20 @@ class StackExchangeClusteringP2P(AbsTaskClustering): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@article{geigle:2021:arxiv, - author = {Gregor Geigle and - Nils Reimers and - Andreas R{\"u}ckl{\'e} and - Iryna Gurevych}, - title = {TWEAC: Transformer with Extendable QA Agent Classifiers}, - journal = {arXiv preprint}, - volume = {abs/2104.07081}, - year = {2021}, - url = {http://arxiv.org/abs/2104.07081}, - archivePrefix = {arXiv}, - eprint = {2104.07081} - }""", + bibtex_citation=r""" +@article{geigle:2021:arxiv, + archiveprefix = {arXiv}, + author = {Gregor Geigle and +Nils Reimers and +Andreas R{\"u}ckl{\'e} and +Iryna Gurevych}, + eprint = {2104.07081}, + journal = {arXiv preprint}, + title = {TWEAC: Transformer with Extendable QA Agent Classifiers}, + url = {http://arxiv.org/abs/2104.07081}, + volume = {abs/2104.07081}, + year = {2021}, +} +""", prompt="Identify the topic or theme of StackExchange posts based on the given paragraphs", ) diff --git a/mteb/tasks/Clustering/eng/TwentyNewsgroupsClustering.py b/mteb/tasks/Clustering/eng/TwentyNewsgroupsClustering.py index abdca6638c..5c82af900a 100644 --- a/mteb/tasks/Clustering/eng/TwentyNewsgroupsClustering.py +++ b/mteb/tasks/Clustering/eng/TwentyNewsgroupsClustering.py @@ -35,20 +35,21 @@ class TwentyNewsgroupsClustering(AbsTaskClustering): 
annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@incollection{LANG1995331, - title = {NewsWeeder: Learning to Filter Netnews}, - editor = {Armand Prieditis and Stuart Russell}, - booktitle = {Machine Learning Proceedings 1995}, - publisher = {Morgan Kaufmann}, - address = {San Francisco (CA)}, - pages = {331-339}, - year = {1995}, - isbn = {978-1-55860-377-6}, - doi = {https://doi.org/10.1016/B978-1-55860-377-6.50048-7}, - url = {https://www.sciencedirect.com/science/article/pii/B9781558603776500487}, - author = {Ken Lang}, - } - """, + bibtex_citation=r""" +@incollection{LANG1995331, + address = {San Francisco (CA)}, + author = {Ken Lang}, + booktitle = {Machine Learning Proceedings 1995}, + doi = {https://doi.org/10.1016/B978-1-55860-377-6.50048-7}, + editor = {Armand Prieditis and Stuart Russell}, + isbn = {978-1-55860-377-6}, + pages = {331-339}, + publisher = {Morgan Kaufmann}, + title = {NewsWeeder: Learning to Filter Netnews}, + url = {https://www.sciencedirect.com/science/article/pii/B9781558603776500487}, + year = {1995}, +} +""", prompt="Identify the topic or theme of the given news articles", ) @@ -75,20 +76,21 @@ class TwentyNewsgroupsClusteringFast(AbsTaskClusteringFast): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@incollection{LANG1995331, - title = {NewsWeeder: Learning to Filter Netnews}, - editor = {Armand Prieditis and Stuart Russell}, - booktitle = {Machine Learning Proceedings 1995}, - publisher = {Morgan Kaufmann}, - address = {San Francisco (CA)}, - pages = {331-339}, - year = {1995}, - isbn = {978-1-55860-377-6}, - doi = {https://doi.org/10.1016/B978-1-55860-377-6.50048-7}, - url = {https://www.sciencedirect.com/science/article/pii/B9781558603776500487}, - author = {Ken Lang}, - } - """, + bibtex_citation=r""" +@incollection{LANG1995331, + address = {San Francisco (CA)}, + author = {Ken Lang}, + booktitle = {Machine Learning Proceedings 1995}, + doi = 
{https://doi.org/10.1016/B978-1-55860-377-6.50048-7}, + editor = {Armand Prieditis and Stuart Russell}, + isbn = {978-1-55860-377-6}, + pages = {331-339}, + publisher = {Morgan Kaufmann}, + title = {NewsWeeder: Learning to Filter Netnews}, + url = {https://www.sciencedirect.com/science/article/pii/B9781558603776500487}, + year = {1995}, +} +""", prompt="Identify the topic or theme of the given news articles", adapted_from=["TwentyNewsgroupsClustering"], ) diff --git a/mteb/tasks/Clustering/eng/WikiCitiesClustering.py b/mteb/tasks/Clustering/eng/WikiCitiesClustering.py index be897938a8..3f37207640 100644 --- a/mteb/tasks/Clustering/eng/WikiCitiesClustering.py +++ b/mteb/tasks/Clustering/eng/WikiCitiesClustering.py @@ -27,9 +27,11 @@ class WikiCitiesClustering(AbsTaskClustering): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@ONLINE{wikidump, - author = "Wikimedia Foundation", - title = "Wikimedia Downloads", - url = "https://dumps.wikimedia.org" -}""", + bibtex_citation=r""" +@online{wikidump, + author = {Wikimedia Foundation}, + title = {Wikimedia Downloads}, + url = {https://dumps.wikimedia.org}, +} +""", ) diff --git a/mteb/tasks/Clustering/eng/WikipediaChemistrySpecialtiesClustering.py b/mteb/tasks/Clustering/eng/WikipediaChemistrySpecialtiesClustering.py index a4e4082a69..05b7aec7bf 100644 --- a/mteb/tasks/Clustering/eng/WikipediaChemistrySpecialtiesClustering.py +++ b/mteb/tasks/Clustering/eng/WikipediaChemistrySpecialtiesClustering.py @@ -26,12 +26,12 @@ class WikipediaChemistrySpecialtiesClustering(AbsTaskClustering): annotations_creators="derived", dialect=[], sample_creation="created", - bibtex_citation=""" - @article{kasmaee2024chemteb, - title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, - author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and 
Samiee, Soheila}, - journal={arXiv preprint arXiv:2412.00532}, - year={2024} - } - """, + bibtex_citation=r""" +@article{kasmaee2024chemteb, + author = {Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal = {arXiv preprint arXiv:2412.00532}, + title = {ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + year = {2024}, +} +""", ) diff --git a/mteb/tasks/Clustering/eng/WikipediaChemistryTopicsClustering.py b/mteb/tasks/Clustering/eng/WikipediaChemistryTopicsClustering.py index bfa5e1fcf3..a170d89107 100644 --- a/mteb/tasks/Clustering/eng/WikipediaChemistryTopicsClustering.py +++ b/mteb/tasks/Clustering/eng/WikipediaChemistryTopicsClustering.py @@ -26,12 +26,12 @@ class WikipediaChemistryTopicsClustering(AbsTaskClustering): annotations_creators="derived", dialect=[], sample_creation="created", - bibtex_citation=""" - @article{kasmaee2024chemteb, - title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, - author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, - journal={arXiv preprint arXiv:2412.00532}, - year={2024} - } - """, + bibtex_citation=r""" +@article{kasmaee2024chemteb, + author = {Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal = {arXiv preprint arXiv:2412.00532}, + title = {ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + year = {2024}, +} +""", ) diff --git a/mteb/tasks/Clustering/fra/AlloProfClusteringP2P.py b/mteb/tasks/Clustering/fra/AlloProfClusteringP2P.py index 0e95b82773..83a22953fb 100644 --- 
a/mteb/tasks/Clustering/fra/AlloProfClusteringP2P.py +++ b/mteb/tasks/Clustering/fra/AlloProfClusteringP2P.py @@ -30,24 +30,26 @@ class AlloProfClusteringP2P(AbsTaskClustering): eval_splits=["test"], eval_langs=["fra-Latn"], main_score="v_measure", - date=None, + date=("1996-01-01", "2023-04-14"), form=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, + domains=["Encyclopaedic", "Written"], + task_subtypes=["Thematic clustering"], + license="mit", + annotations_creators="human-annotated", dialect=None, - sample_creation=None, - bibtex_citation="""@misc{lef23, - doi = {10.48550/ARXIV.2302.07738}, - url = {https://arxiv.org/abs/2302.07738}, + sample_creation="found", + bibtex_citation=r""" +@misc{lef23, author = {Lefebvre-Brossard, Antoine and Gazaille, Stephane and Desmarais, Michel C.}, + copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International}, + doi = {10.48550/ARXIV.2302.07738}, keywords = {Computation and Language (cs.CL), Information Retrieval (cs.IR), Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences}, - title = {Alloprof: a new French question-answer education dataset and its use in an information retrieval case study}, publisher = {arXiv}, + title = {Alloprof: a new French question-answer education dataset and its use in an information retrieval case study}, + url = {https://arxiv.org/abs/2302.07738}, year = {2023}, - copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International} -}""", +} +""", ) def create_description(self, example): @@ -96,15 +98,16 @@ class AlloProfClusteringP2PFast(AbsTaskClusteringFast): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@misc{lef23, - doi = {10.48550/ARXIV.2302.07738}, - url = {https://arxiv.org/abs/2302.07738}, + bibtex_citation=r""" +@misc{lef23, author = {Lefebvre-Brossard, Antoine and Gazaille, Stephane and Desmarais, 
Michel C.}, + copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International}, + doi = {10.48550/ARXIV.2302.07738}, keywords = {Computation and Language (cs.CL), Information Retrieval (cs.IR), Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences}, - title = {Alloprof: a new French question-answer education dataset and its use in an information retrieval case study}, publisher = {arXiv}, + title = {Alloprof: a new French question-answer education dataset and its use in an information retrieval case study}, + url = {https://arxiv.org/abs/2302.07738}, year = {2023}, - copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International} } """, adapted_from=["AlloProfClusteringP2P"], diff --git a/mteb/tasks/Clustering/fra/AlloProfClusteringS2S.py b/mteb/tasks/Clustering/fra/AlloProfClusteringS2S.py index 1b4f609827..349faaec7f 100644 --- a/mteb/tasks/Clustering/fra/AlloProfClusteringS2S.py +++ b/mteb/tasks/Clustering/fra/AlloProfClusteringS2S.py @@ -30,24 +30,26 @@ class AlloProfClusteringS2S(AbsTaskClustering): eval_splits=["test"], eval_langs=["fra-Latn"], main_score="v_measure", - date=None, + date=("1996-01-01", "2023-04-14"), form=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, + domains=["Encyclopaedic", "Written"], + task_subtypes=["Thematic clustering"], + license="mit", + annotations_creators="human-annotated", dialect=None, - sample_creation=None, - bibtex_citation="""@misc{lef23, - doi = {10.48550/ARXIV.2302.07738}, - url = {https://arxiv.org/abs/2302.07738}, + sample_creation="found", + bibtex_citation=r""" +@misc{lef23, author = {Lefebvre-Brossard, Antoine and Gazaille, Stephane and Desmarais, Michel C.}, + copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International}, + doi = {10.48550/ARXIV.2302.07738}, keywords = {Computation and Language (cs.CL), Information Retrieval (cs.IR), Machine Learning 
(cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences}, - title = {Alloprof: a new French question-answer education dataset and its use in an information retrieval case study}, publisher = {arXiv}, + title = {Alloprof: a new French question-answer education dataset and its use in an information retrieval case study}, + url = {https://arxiv.org/abs/2302.07738}, year = {2023}, - copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International} -}""", +} +""", ) def dataset_transform(self): @@ -93,15 +95,16 @@ class AlloProfClusteringS2SFast(AbsTaskClusteringFast): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@misc{lef23, - doi = {10.48550/ARXIV.2302.07738}, - url = {https://arxiv.org/abs/2302.07738}, + bibtex_citation=r""" +@misc{lef23, author = {Lefebvre-Brossard, Antoine and Gazaille, Stephane and Desmarais, Michel C.}, + copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International}, + doi = {10.48550/ARXIV.2302.07738}, keywords = {Computation and Language (cs.CL), Information Retrieval (cs.IR), Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences}, - title = {Alloprof: a new French question-answer education dataset and its use in an information retrieval case study}, publisher = {arXiv}, + title = {Alloprof: a new French question-answer education dataset and its use in an information retrieval case study}, + url = {https://arxiv.org/abs/2302.07738}, year = {2023}, - copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International} } """, adapted_from=["AlloProfClusteringS2S"], diff --git a/mteb/tasks/Clustering/fra/HALClusteringS2S.py b/mteb/tasks/Clustering/fra/HALClusteringS2S.py index c6254befe6..eaeb4924a0 100644 --- a/mteb/tasks/Clustering/fra/HALClusteringS2S.py +++ b/mteb/tasks/Clustering/fra/HALClusteringS2S.py @@ -32,22 +32,24 @@ class 
HALClusteringS2S(AbsTaskClustering): eval_splits=["test"], eval_langs=["fra-Latn"], main_score="v_measure", - date=None, + date=("2000-03-29", "2024-05-24"), form=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, + domains=["Academic", "Written"], + task_subtypes=["Thematic clustering"], + license="apache-2.0", + annotations_creators="human-annotated", dialect=None, - sample_creation=None, - bibtex_citation="""@misc{ciancone2024extending, - title={Extending the Massive Text Embedding Benchmark to French}, - author={Mathieu Ciancone and Imene Kerboua and Marion Schaeffer and Wissam Siblini}, - year={2024}, - eprint={2405.20468}, - archivePrefix={arXiv}, - primaryClass={cs.CL} -}""", + sample_creation="found", + bibtex_citation=r""" +@misc{ciancone2024extending, + archiveprefix = {arXiv}, + author = {Mathieu Ciancone and Imene Kerboua and Marion Schaeffer and Wissam Siblini}, + eprint = {2405.20468}, + primaryclass = {cs.CL}, + title = {Extending the Massive Text Embedding Benchmark to French}, + year = {2024}, +} +""", ) def dataset_transform(self): @@ -87,14 +89,16 @@ class HALClusteringS2SFast(AbsTaskClusteringFast): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@misc{ciancone2024extending, - title={Extending the Massive Text Embedding Benchmark to French}, - author={Mathieu Ciancone and Imene Kerboua and Marion Schaeffer and Wissam Siblini}, - year={2024}, - eprint={2405.20468}, - archivePrefix={arXiv}, - primaryClass={cs.CL} -}""", + bibtex_citation=r""" +@misc{ciancone2024extending, + archiveprefix = {arXiv}, + author = {Mathieu Ciancone and Imene Kerboua and Marion Schaeffer and Wissam Siblini}, + eprint = {2405.20468}, + primaryclass = {cs.CL}, + title = {Extending the Massive Text Embedding Benchmark to French}, + year = {2024}, +} +""", adapted_from=["HALClusteringS2S"], ) diff --git a/mteb/tasks/Clustering/jpn/MewsC16JaClustering.py 
b/mteb/tasks/Clustering/jpn/MewsC16JaClustering.py index 5c8bfe01fa..dfe010cdb6 100644 --- a/mteb/tasks/Clustering/jpn/MewsC16JaClustering.py +++ b/mteb/tasks/Clustering/jpn/MewsC16JaClustering.py @@ -33,25 +33,24 @@ class MewsC16JaClustering(AbsTaskClusteringFast): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation=""" - @inproceedings{ - nishikawa-etal-2022-ease, - title = "{EASE}: Entity-Aware Contrastive Learning of Sentence Embedding", - author = "Nishikawa, Sosuke and - Ri, Ryokan and - Yamada, Ikuya and - Tsuruoka, Yoshimasa and - Echizen, Isao", - booktitle = "Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies", - month = jul, - year = "2022", - address = "Seattle, United States", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/2022.naacl-main.284", - pages = "3870--3885", - abstract = "We present EASE, a novel method for learning sentence embeddings via contrastive learning between sentences and their related entities.The advantage of using entity supervision is twofold: (1) entities have been shown to be a strong indicator of text semantics and thus should provide rich training signals for sentence embeddings; (2) entities are defined independently of languages and thus offer useful cross-lingual alignment supervision.We evaluate EASE against other unsupervised models both in monolingual and multilingual settings.We show that EASE exhibits competitive or better performance in English semantic textual similarity (STS) and short text clustering (STC) tasks and it significantly outperforms baseline methods in multilingual settings on a variety of tasks.Our source code, pre-trained models, and newly constructed multi-lingual STC dataset are available at https://github.com/studio-ousia/ease.", - } - """, + bibtex_citation=r""" +@inproceedings{nishikawa-etal-2022-ease, + abstract = {We 
present EASE, a novel method for learning sentence embeddings via contrastive learning between sentences and their related entities.The advantage of using entity supervision is twofold: (1) entities have been shown to be a strong indicator of text semantics and thus should provide rich training signals for sentence embeddings; (2) entities are defined independently of languages and thus offer useful cross-lingual alignment supervision.We evaluate EASE against other unsupervised models both in monolingual and multilingual settings.We show that EASE exhibits competitive or better performance in English semantic textual similarity (STS) and short text clustering (STC) tasks and it significantly outperforms baseline methods in multilingual settings on a variety of tasks.Our source code, pre-trained models, and newly constructed multi-lingual STC dataset are available at https://github.com/studio-ousia/ease.}, + address = {Seattle, United States}, + author = {Nishikawa, Sosuke and +Ri, Ryokan and +Yamada, Ikuya and +Tsuruoka, Yoshimasa and +Echizen, Isao}, + booktitle = {Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies}, + month = jul, + pages = {3870--3885}, + publisher = {Association for Computational Linguistics}, + title = {{EASE}: Entity-Aware Contrastive Learning of Sentence Embedding}, + url = {https://aclanthology.org/2022.naacl-main.284}, + year = {2022}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Clustering/kor/KlueMrcDomainClustering.py b/mteb/tasks/Clustering/kor/KlueMrcDomainClustering.py index fc2b27b884..de070dd087 100644 --- a/mteb/tasks/Clustering/kor/KlueMrcDomainClustering.py +++ b/mteb/tasks/Clustering/kor/KlueMrcDomainClustering.py @@ -28,14 +28,16 @@ class KlueMrcDomainClustering(AbsTaskClustering): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@misc{park2021klue, - title={KLUE: 
Korean Language Understanding Evaluation}, - author={Sungjoon Park and Jihyung Moon and Sungdong Kim and Won Ik Cho and Jiyoon Han and Jangwon Park and Chisung Song and Junseong Kim and Yongsook Song and Taehwan Oh and Joohong Lee and Juhyun Oh and Sungwon Lyu and Younghoon Jeong and Inkwon Lee and Sangwoo Seo and Dongjun Lee and Hyunwoo Kim and Myeonghwa Lee and Seongbo Jang and Seungwon Do and Sunkyoung Kim and Kyungtae Lim and Jongwon Lee and Kyumin Park and Jamin Shin and Seonghyun Kim and Lucy Park and Alice Oh and Jungwoo Ha and Kyunghyun Cho}, - year={2021}, - eprint={2105.09680}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, -}""", + bibtex_citation=r""" +@misc{park2021klue, + archiveprefix = {arXiv}, + author = {Sungjoon Park and Jihyung Moon and Sungdong Kim and Won Ik Cho and Jiyoon Han and Jangwon Park and Chisung Song and Junseong Kim and Yongsook Song and Taehwan Oh and Joohong Lee and Juhyun Oh and Sungwon Lyu and Younghoon Jeong and Inkwon Lee and Sangwoo Seo and Dongjun Lee and Hyunwoo Kim and Myeonghwa Lee and Seongbo Jang and Seungwon Do and Sunkyoung Kim and Kyungtae Lim and Jongwon Lee and Kyumin Park and Jamin Shin and Seonghyun Kim and Lucy Park and Alice Oh and Jungwoo Ha and Kyunghyun Cho}, + eprint = {2105.09680}, + primaryclass = {cs.CL}, + title = {KLUE: Korean Language Understanding Evaluation}, + year = {2021}, +} +""", prompt="Identify the topic or theme of the given texts", ) diff --git a/mteb/tasks/Clustering/kor/KlueYnatMrcCategoryClustering.py b/mteb/tasks/Clustering/kor/KlueYnatMrcCategoryClustering.py index d31dd87add..14358e7206 100644 --- a/mteb/tasks/Clustering/kor/KlueYnatMrcCategoryClustering.py +++ b/mteb/tasks/Clustering/kor/KlueYnatMrcCategoryClustering.py @@ -28,14 +28,16 @@ class KlueYnatMrcCategoryClustering(AbsTaskClustering): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@misc{park2021klue, - title={KLUE: Korean Language Understanding Evaluation}, - 
author={Sungjoon Park and Jihyung Moon and Sungdong Kim and Won Ik Cho and Jiyoon Han and Jangwon Park and Chisung Song and Junseong Kim and Yongsook Song and Taehwan Oh and Joohong Lee and Juhyun Oh and Sungwon Lyu and Younghoon Jeong and Inkwon Lee and Sangwoo Seo and Dongjun Lee and Hyunwoo Kim and Myeonghwa Lee and Seongbo Jang and Seungwon Do and Sunkyoung Kim and Kyungtae Lim and Jongwon Lee and Kyumin Park and Jamin Shin and Seonghyun Kim and Lucy Park and Alice Oh and Jungwoo Ha and Kyunghyun Cho}, - year={2021}, - eprint={2105.09680}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, -}""", + bibtex_citation=r""" +@misc{park2021klue, + archiveprefix = {arXiv}, + author = {Sungjoon Park and Jihyung Moon and Sungdong Kim and Won Ik Cho and Jiyoon Han and Jangwon Park and Chisung Song and Junseong Kim and Yongsook Song and Taehwan Oh and Joohong Lee and Juhyun Oh and Sungwon Lyu and Younghoon Jeong and Inkwon Lee and Sangwoo Seo and Dongjun Lee and Hyunwoo Kim and Myeonghwa Lee and Seongbo Jang and Seungwon Do and Sunkyoung Kim and Kyungtae Lim and Jongwon Lee and Kyumin Park and Jamin Shin and Seonghyun Kim and Lucy Park and Alice Oh and Jungwoo Ha and Kyunghyun Cho}, + eprint = {2105.09680}, + primaryclass = {cs.CL}, + title = {KLUE: Korean Language Understanding Evaluation}, + year = {2021}, +} +""", prompt="Identify the topic or theme of the given texts", ) diff --git a/mteb/tasks/Clustering/multilingual/IndicReviewsClusteringP2P.py b/mteb/tasks/Clustering/multilingual/IndicReviewsClusteringP2P.py index 8f649a745b..3ea45b2d99 100644 --- a/mteb/tasks/Clustering/multilingual/IndicReviewsClusteringP2P.py +++ b/mteb/tasks/Clustering/multilingual/IndicReviewsClusteringP2P.py @@ -49,13 +49,15 @@ class IndicReviewsClusteringP2P(AbsTaskClustering, MultilingualTask): annotations_creators="human-annotated", dialect=[], sample_creation="machine-translated and verified", - bibtex_citation="""@article{doddapaneni2022towards, - title = {Towards Leaving No Indic Language 
Behind: Building Monolingual Corpora, Benchmark and Models for Indic Languages}, - author = {Sumanth Doddapaneni and Rahul Aralikatte and Gowtham Ramesh and Shreyansh Goyal and Mitesh M. Khapra and Anoop Kunchukuttan and Pratyush Kumar}, - journal = {Annual Meeting of the Association for Computational Linguistics}, - year = {2022}, - doi = {10.18653/v1/2023.acl-long.693} -}""", + bibtex_citation=r""" +@article{doddapaneni2022towards, + author = {Sumanth Doddapaneni and Rahul Aralikatte and Gowtham Ramesh and Shreyansh Goyal and Mitesh M. Khapra and Anoop Kunchukuttan and Pratyush Kumar}, + doi = {10.18653/v1/2023.acl-long.693}, + journal = {Annual Meeting of the Association for Computational Linguistics}, + title = {Towards Leaving No Indic Language Behind: Building Monolingual Corpora, Benchmark and Models for Indic Languages}, + year = {2022}, +} +""", ) def load_data(self, **kwargs: Any) -> None: diff --git a/mteb/tasks/Clustering/multilingual/MLSUMClusteringP2P.py b/mteb/tasks/Clustering/multilingual/MLSUMClusteringP2P.py index 90d6fb17ba..fc341c00d0 100644 --- a/mteb/tasks/Clustering/multilingual/MLSUMClusteringP2P.py +++ b/mteb/tasks/Clustering/multilingual/MLSUMClusteringP2P.py @@ -44,12 +44,14 @@ class MLSUMClusteringP2P(AbsTaskClustering, MultilingualTask): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@article{scialom2020mlsum, - title={MLSUM: The Multilingual Summarization Corpus}, - author={Scialom, Thomas and Dray, Paul-Alexis and Lamprier, Sylvain and Piwowarski, Benjamin and Staiano, Jacopo}, - journal={arXiv preprint arXiv:2004.14900}, - year={2020} - }""", + bibtex_citation=r""" +@article{scialom2020mlsum, + author = {Scialom, Thomas and Dray, Paul-Alexis and Lamprier, Sylvain and Piwowarski, Benjamin and Staiano, Jacopo}, + journal = {arXiv preprint arXiv:2004.14900}, + title = {MLSUM: The Multilingual Summarization Corpus}, + year = {2020}, +} +""", ) def load_data(self, **kwargs): @@ -114,12 +116,14 @@ 
class MLSUMClusteringP2PFast(AbsTaskClusteringFast, MultilingualTask): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@article{scialom2020mlsum, - title={MLSUM: The Multilingual Summarization Corpus}, - author={Scialom, Thomas and Dray, Paul-Alexis and Lamprier, Sylvain and Piwowarski, Benjamin and Staiano, Jacopo}, - journal={arXiv preprint arXiv:2004.14900}, - year={2020} - }""", + bibtex_citation=r""" +@article{scialom2020mlsum, + author = {Scialom, Thomas and Dray, Paul-Alexis and Lamprier, Sylvain and Piwowarski, Benjamin and Staiano, Jacopo}, + journal = {arXiv preprint arXiv:2004.14900}, + title = {MLSUM: The Multilingual Summarization Corpus}, + year = {2020}, +} +""", adapted_from=["MLSUMClusteringP2P"], ) diff --git a/mteb/tasks/Clustering/multilingual/MLSUMClusteringS2S.py b/mteb/tasks/Clustering/multilingual/MLSUMClusteringS2S.py index 3cd6aa2d3a..f29200c233 100644 --- a/mteb/tasks/Clustering/multilingual/MLSUMClusteringS2S.py +++ b/mteb/tasks/Clustering/multilingual/MLSUMClusteringS2S.py @@ -44,12 +44,14 @@ class MLSUMClusteringS2S(AbsTaskClustering, MultilingualTask): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@article{scialom2020mlsum, - title={MLSUM: The Multilingual Summarization Corpus}, - author={Scialom, Thomas and Dray, Paul-Alexis and Lamprier, Sylvain and Piwowarski, Benjamin and Staiano, Jacopo}, - journal={arXiv preprint arXiv:2004.14900}, - year={2020} - }""", + bibtex_citation=r""" +@article{scialom2020mlsum, + author = {Scialom, Thomas and Dray, Paul-Alexis and Lamprier, Sylvain and Piwowarski, Benjamin and Staiano, Jacopo}, + journal = {arXiv preprint arXiv:2004.14900}, + title = {MLSUM: The Multilingual Summarization Corpus}, + year = {2020}, +} +""", ) def load_data(self, **kwargs): @@ -109,12 +111,14 @@ class MLSUMClusteringS2SFast(AbsTaskClusteringFast, MultilingualTask): annotations_creators="derived", dialect=[], sample_creation="found", - 
bibtex_citation="""@article{scialom2020mlsum, - title={MLSUM: The Multilingual Summarization Corpus}, - author={Scialom, Thomas and Dray, Paul-Alexis and Lamprier, Sylvain and Piwowarski, Benjamin and Staiano, Jacopo}, - journal={arXiv preprint arXiv:2004.14900}, - year={2020} - }""", + bibtex_citation=r""" +@article{scialom2020mlsum, + author = {Scialom, Thomas and Dray, Paul-Alexis and Lamprier, Sylvain and Piwowarski, Benjamin and Staiano, Jacopo}, + journal = {arXiv preprint arXiv:2004.14900}, + title = {MLSUM: The Multilingual Summarization Corpus}, + year = {2020}, +} +""", adapted_from=["MLSUMClusteringS2S"], ) diff --git a/mteb/tasks/Clustering/multilingual/MasakhaNEWSClusteringP2P.py b/mteb/tasks/Clustering/multilingual/MasakhaNEWSClusteringP2P.py index 480cceff8f..a8c611f00f 100644 --- a/mteb/tasks/Clustering/multilingual/MasakhaNEWSClusteringP2P.py +++ b/mteb/tasks/Clustering/multilingual/MasakhaNEWSClusteringP2P.py @@ -53,13 +53,15 @@ class MasakhaNEWSClusteringP2P(AbsTaskClustering, MultilingualTask): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@article{adelani2023masakhanews, - title={MasakhaNEWS: News Topic Classification for African languages}, - author={David Ifeoluwa Adelani and Marek Masiak and Israel Abebe Azime and Jesujoba Oluwadara Alabi and Atnafu Lambebo Tonja and Christine Mwase and Odunayo Ogundepo and Bonaventure F. P. Dossou and Akintunde Oladipo and Doreen Nixdorf and Chris Chinenye Emezue and Sana Sabah al-azzawi and Blessing K. Sibanda and Davis David and Lolwethu Ndolela and Jonathan Mukiibi and Tunde Oluwaseyi Ajayi and Tatiana Moteu Ngoli and Brian Odhiambo and Abraham Toluwase Owodunni and Nnaemeka C. 
Obiefuna and Shamsuddeen Hassan Muhammad and Saheed Salahudeen Abdullahi and Mesay Gemeda Yigezu and Tajuddeen Gwadabe and Idris Abdulmumin and Mahlet Taye Bame and Oluwabusayo Olufunke Awoyomi and Iyanuoluwa Shode and Tolulope Anu Adelani and Habiba Abdulganiy Kailani and Abdul-Hakeem Omotayo and Adetola Adeeko and Afolabi Abeeb and Anuoluwapo Aremu and Olanrewaju Samuel and Clemencia Siro and Wangari Kimotho and Onyekachi Raphael Ogbu and Chinedu E. Mbonu and Chiamaka I. Chukwuneke and Samuel Fanijo and Jessica Ojo and Oyinkansola F. Awosan and Tadesse Kebede Guge and Sakayo Toadoum Sari and Pamela Nyatsine and Freedmore Sidume and Oreen Yousuf and Mardiyyah Oduwole and Ussen Kimanuka and Kanda Patrick Tshinu and Thina Diko and Siyanda Nxakama and Abdulmejid Tuni Johar and Sinodos Gebre and Muhidin Mohamed and Shafie Abdi Mohamed and Fuad Mire Hassan and Moges Ahmed Mehamed and Evrard Ngabire and and Pontus Stenetorp}, - journal={ArXiv}, - year={2023}, - volume={} -}""", + bibtex_citation=r""" +@article{adelani2023masakhanews, + author = {David Ifeoluwa Adelani and Marek Masiak and Israel Abebe Azime and Jesujoba Oluwadara Alabi and Atnafu Lambebo Tonja and Christine Mwase and Odunayo Ogundepo and Bonaventure F. P. Dossou and Akintunde Oladipo and Doreen Nixdorf and Chris Chinenye Emezue and Sana Sabah al-azzawi and Blessing K. Sibanda and Davis David and Lolwethu Ndolela and Jonathan Mukiibi and Tunde Oluwaseyi Ajayi and Tatiana Moteu Ngoli and Brian Odhiambo and Abraham Toluwase Owodunni and Nnaemeka C. Obiefuna and Shamsuddeen Hassan Muhammad and Saheed Salahudeen Abdullahi and Mesay Gemeda Yigezu and Tajuddeen Gwadabe and Idris Abdulmumin and Mahlet Taye Bame and Oluwabusayo Olufunke Awoyomi and Iyanuoluwa Shode and Tolulope Anu Adelani and Habiba Abdulganiy Kailani and Abdul-Hakeem Omotayo and Adetola Adeeko and Afolabi Abeeb and Anuoluwapo Aremu and Olanrewaju Samuel and Clemencia Siro and Wangari Kimotho and Onyekachi Raphael Ogbu and Chinedu E. 
Mbonu and Chiamaka I. Chukwuneke and Samuel Fanijo and Jessica Ojo and Oyinkansola F. Awosan and Tadesse Kebede Guge and Sakayo Toadoum Sari and Pamela Nyatsine and Freedmore Sidume and Oreen Yousuf and Mardiyyah Oduwole and Ussen Kimanuka and Kanda Patrick Tshinu and Thina Diko and Siyanda Nxakama and Abdulmejid Tuni Johar and Sinodos Gebre and Muhidin Mohamed and Shafie Abdi Mohamed and Fuad Mire Hassan and Moges Ahmed Mehamed and Evrard Ngabire and and Pontus Stenetorp}, + journal = {ArXiv}, + title = {MasakhaNEWS: News Topic Classification for African languages}, + volume = {}, + year = {2023}, +} +""", ) def load_data(self, **kwargs): diff --git a/mteb/tasks/Clustering/multilingual/MasakhaNEWSClusteringS2S.py b/mteb/tasks/Clustering/multilingual/MasakhaNEWSClusteringS2S.py index 7e8b22b9af..6ccddba538 100644 --- a/mteb/tasks/Clustering/multilingual/MasakhaNEWSClusteringS2S.py +++ b/mteb/tasks/Clustering/multilingual/MasakhaNEWSClusteringS2S.py @@ -45,20 +45,22 @@ class MasakhaNEWSClusteringS2S(AbsTaskClustering, MultilingualTask): eval_splits=["test"], eval_langs=_LANGUAGES, main_score="v_measure", - date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, + date=("2023-04-21", "2023-05-26"), + domains=["News", "Written"], + task_subtypes=["Topic classification"], + license="afl-3.0", + annotations_creators="human-annotated", dialect=None, sample_creation=None, - bibtex_citation="""@article{adelani2023masakhanews, - title={MasakhaNEWS: News Topic Classification for African languages}, - author={David Ifeoluwa Adelani and Marek Masiak and Israel Abebe Azime and Jesujoba Oluwadara Alabi and Atnafu Lambebo Tonja and Christine Mwase and Odunayo Ogundepo and Bonaventure F. P. Dossou and Akintunde Oladipo and Doreen Nixdorf and Chris Chinenye Emezue and Sana Sabah al-azzawi and Blessing K. 
Sibanda and Davis David and Lolwethu Ndolela and Jonathan Mukiibi and Tunde Oluwaseyi Ajayi and Tatiana Moteu Ngoli and Brian Odhiambo and Abraham Toluwase Owodunni and Nnaemeka C. Obiefuna and Shamsuddeen Hassan Muhammad and Saheed Salahudeen Abdullahi and Mesay Gemeda Yigezu and Tajuddeen Gwadabe and Idris Abdulmumin and Mahlet Taye Bame and Oluwabusayo Olufunke Awoyomi and Iyanuoluwa Shode and Tolulope Anu Adelani and Habiba Abdulganiy Kailani and Abdul-Hakeem Omotayo and Adetola Adeeko and Afolabi Abeeb and Anuoluwapo Aremu and Olanrewaju Samuel and Clemencia Siro and Wangari Kimotho and Onyekachi Raphael Ogbu and Chinedu E. Mbonu and Chiamaka I. Chukwuneke and Samuel Fanijo and Jessica Ojo and Oyinkansola F. Awosan and Tadesse Kebede Guge and Sakayo Toadoum Sari and Pamela Nyatsine and Freedmore Sidume and Oreen Yousuf and Mardiyyah Oduwole and Ussen Kimanuka and Kanda Patrick Tshinu and Thina Diko and Siyanda Nxakama and Abdulmejid Tuni Johar and Sinodos Gebre and Muhidin Mohamed and Shafie Abdi Mohamed and Fuad Mire Hassan and Moges Ahmed Mehamed and Evrard Ngabire and and Pontus Stenetorp}, - journal={ArXiv}, - year={2023}, - volume={} -}""", + bibtex_citation=r""" +@article{adelani2023masakhanews, + author = {David Ifeoluwa Adelani and Marek Masiak and Israel Abebe Azime and Jesujoba Oluwadara Alabi and Atnafu Lambebo Tonja and Christine Mwase and Odunayo Ogundepo and Bonaventure F. P. Dossou and Akintunde Oladipo and Doreen Nixdorf and Chris Chinenye Emezue and Sana Sabah al-azzawi and Blessing K. Sibanda and Davis David and Lolwethu Ndolela and Jonathan Mukiibi and Tunde Oluwaseyi Ajayi and Tatiana Moteu Ngoli and Brian Odhiambo and Abraham Toluwase Owodunni and Nnaemeka C. 
Obiefuna and Shamsuddeen Hassan Muhammad and Saheed Salahudeen Abdullahi and Mesay Gemeda Yigezu and Tajuddeen Gwadabe and Idris Abdulmumin and Mahlet Taye Bame and Oluwabusayo Olufunke Awoyomi and Iyanuoluwa Shode and Tolulope Anu Adelani and Habiba Abdulganiy Kailani and Abdul-Hakeem Omotayo and Adetola Adeeko and Afolabi Abeeb and Anuoluwapo Aremu and Olanrewaju Samuel and Clemencia Siro and Wangari Kimotho and Onyekachi Raphael Ogbu and Chinedu E. Mbonu and Chiamaka I. Chukwuneke and Samuel Fanijo and Jessica Ojo and Oyinkansola F. Awosan and Tadesse Kebede Guge and Sakayo Toadoum Sari and Pamela Nyatsine and Freedmore Sidume and Oreen Yousuf and Mardiyyah Oduwole and Ussen Kimanuka and Kanda Patrick Tshinu and Thina Diko and Siyanda Nxakama and Abdulmejid Tuni Johar and Sinodos Gebre and Muhidin Mohamed and Shafie Abdi Mohamed and Fuad Mire Hassan and Moges Ahmed Mehamed and Evrard Ngabire and and Pontus Stenetorp}, + journal = {ArXiv}, + title = {MasakhaNEWS: News Topic Classification for African languages}, + volume = {}, + year = {2023}, +} +""", ) def load_data(self, **kwargs): diff --git a/mteb/tasks/Clustering/multilingual/SIB200ClusteringS2S.py b/mteb/tasks/Clustering/multilingual/SIB200ClusteringS2S.py index 8569b55cd5..68e66ddc3e 100644 --- a/mteb/tasks/Clustering/multilingual/SIB200ClusteringS2S.py +++ b/mteb/tasks/Clustering/multilingual/SIB200ClusteringS2S.py @@ -237,12 +237,14 @@ class SIB200ClusteringFast(MultilingualTask, AbsTaskClusteringFast): annotations_creators="expert-annotated", # expert annotated for English --> human translations dialect=[], sample_creation="human-translated and localized", - bibtex_citation="""@article{adelani2023sib, - title={SIB-200: A simple, inclusive, and big evaluation dataset for topic classification in 200+ languages and dialects}, - author={Adelani, David Ifeoluwa and Liu, Hannah and Shen, Xiaoyu and Vassilyev, Nikita and Alabi, Jesujoba O and Mao, Yanke and Gao, Haonan and Lee, Annie En-Shiun}, - 
journal={arXiv preprint arXiv:2309.07445}, - year={2023} - }""", # combined train, validation, and test into test. + bibtex_citation=r""" +@article{adelani2023sib, + author = {Adelani, David Ifeoluwa and Liu, Hannah and Shen, Xiaoyu and Vassilyev, Nikita and Alabi, Jesujoba O and Mao, Yanke and Gao, Haonan and Lee, Annie En-Shiun}, + journal = {arXiv preprint arXiv:2309.07445}, + title = {SIB-200: A simple, inclusive, and big evaluation dataset for topic classification in 200+ languages and dialects}, + year = {2023}, +} +""", # combined train, validation, and test into test. ) def dataset_transform(self): diff --git a/mteb/tasks/Clustering/nob/SNLHierarchicalClustering.py b/mteb/tasks/Clustering/nob/SNLHierarchicalClustering.py index 081a99aebd..19a3d879c4 100644 --- a/mteb/tasks/Clustering/nob/SNLHierarchicalClustering.py +++ b/mteb/tasks/Clustering/nob/SNLHierarchicalClustering.py @@ -36,12 +36,14 @@ class SNLHierarchicalClusteringP2P(AbsTaskClusteringFast): dialect=[], task_subtypes=["Thematic clustering"], sample_creation="found", - bibtex_citation="""@mastersthesis{navjord2023beyond, - title={Beyond extractive: advancing abstractive automatic text summarization in Norwegian with transformers}, - author={Navjord, J{\\o}rgen Johnsen and Korsvik, Jon-Mikkel Ryen}, - year={2023}, - school={Norwegian University of Life Sciences, {\\AA}s} -}""", + bibtex_citation=r""" +@mastersthesis{navjord2023beyond, + author = {Navjord, J{\\o}rgen Johnsen and Korsvik, Jon-Mikkel Ryen}, + school = {Norwegian University of Life Sciences, {\\AA}s}, + title = {Beyond extractive: advancing abstractive automatic text summarization in Norwegian with transformers}, + year = {2023}, +} +""", prompt="Identify categories in a Norwegian lexicon", ) max_depth = 5 @@ -78,12 +80,14 @@ class SNLHierarchicalClusteringS2S(AbsTaskClusteringFast): dialect=[], task_subtypes=["Thematic clustering"], sample_creation="found", - bibtex_citation="""@mastersthesis{navjord2023beyond, - title={Beyond 
extractive: advancing abstractive automatic text summarization in Norwegian with transformers}, - author={Navjord, J{\\o}rgen Johnsen and Korsvik, Jon-Mikkel Ryen}, - year={2023}, - school={Norwegian University of Life Sciences, {\\AA}s} -}""", + bibtex_citation=r""" +@mastersthesis{navjord2023beyond, + author = {Navjord, J{\\o}rgen Johnsen and Korsvik, Jon-Mikkel Ryen}, + school = {Norwegian University of Life Sciences, {\\AA}s}, + title = {Beyond extractive: advancing abstractive automatic text summarization in Norwegian with transformers}, + year = {2023}, +} +""", prompt="Identify categories in a Norwegian lexicon", ) max_depth = 5 diff --git a/mteb/tasks/Clustering/nob/VGHierarchicalClustering.py b/mteb/tasks/Clustering/nob/VGHierarchicalClustering.py index eda3aff310..750d156d68 100644 --- a/mteb/tasks/Clustering/nob/VGHierarchicalClustering.py +++ b/mteb/tasks/Clustering/nob/VGHierarchicalClustering.py @@ -36,12 +36,14 @@ class VGHierarchicalClusteringP2P(AbsTaskClusteringFast): dialect=[], task_subtypes=["Thematic clustering"], sample_creation="found", - bibtex_citation="""@mastersthesis{navjord2023beyond, - title={Beyond extractive: advancing abstractive automatic text summarization in Norwegian with transformers}, - author={Navjord, J{\\o}rgen Johnsen and Korsvik, Jon-Mikkel Ryen}, - year={2023}, - school={Norwegian University of Life Sciences, {\\AA}s} -}""", + bibtex_citation=r""" +@mastersthesis{navjord2023beyond, + author = {Navjord, J{\\o}rgen Johnsen and Korsvik, Jon-Mikkel Ryen}, + school = {Norwegian University of Life Sciences, {\\AA}s}, + title = {Beyond extractive: advancing abstractive automatic text summarization in Norwegian with transformers}, + year = {2023}, +} +""", prompt="Identify the categories (e.g. 
sports) of given articles in Norwegian", ) @@ -81,12 +83,14 @@ class VGHierarchicalClusteringS2S(AbsTaskClusteringFast): dialect=[], task_subtypes=["Thematic clustering"], sample_creation="found", - bibtex_citation="""@mastersthesis{navjord2023beyond, - title={Beyond extractive: advancing abstractive automatic text summarization in Norwegian with transformers}, - author={Navjord, J{\\o}rgen Johnsen and Korsvik, Jon-Mikkel Ryen}, - year={2023}, - school={Norwegian University of Life Sciences, {\\AA}s} -}""", + bibtex_citation=r""" +@mastersthesis{navjord2023beyond, + author = {Navjord, J{\\o}rgen Johnsen and Korsvik, Jon-Mikkel Ryen}, + school = {Norwegian University of Life Sciences, {\\AA}s}, + title = {Beyond extractive: advancing abstractive automatic text summarization in Norwegian with transformers}, + year = {2023}, +} +""", prompt="Identify the categories (e.g. sports) of given articles in Norwegian", ) diff --git a/mteb/tasks/Clustering/nob/snl_clustering.py b/mteb/tasks/Clustering/nob/snl_clustering.py index 9256fc66c0..ae63ba1983 100644 --- a/mteb/tasks/Clustering/nob/snl_clustering.py +++ b/mteb/tasks/Clustering/nob/snl_clustering.py @@ -45,12 +45,14 @@ class SNLClustering(AbsTaskClustering): dialect=[], task_subtypes=["Thematic clustering"], sample_creation="found", - bibtex_citation="""@mastersthesis{navjord2023beyond, - title={Beyond extractive: advancing abstractive automatic text summarization in Norwegian with transformers}, - author={Navjord, J{\o}rgen Johnsen and Korsvik, Jon-Mikkel Ryen}, - year={2023}, - school={Norwegian University of Life Sciences, {\AA}s} -}""", + bibtex_citation=r""" +@mastersthesis{navjord2023beyond, + author = {Navjord, J{\o}rgen Johnsen and Korsvik, Jon-Mikkel Ryen}, + school = {Norwegian University of Life Sciences, {\AA}s}, + title = {Beyond extractive: advancing abstractive automatic text summarization in Norwegian with transformers}, + year = {2023}, +} +""", ) def dataset_transform(self): diff --git 
a/mteb/tasks/Clustering/nob/vg_clustering.py b/mteb/tasks/Clustering/nob/vg_clustering.py index f1050e796b..d3e6aecdd2 100644 --- a/mteb/tasks/Clustering/nob/vg_clustering.py +++ b/mteb/tasks/Clustering/nob/vg_clustering.py @@ -45,12 +45,14 @@ class VGClustering(AbsTaskClustering): dialect=[], task_subtypes=["Thematic clustering"], sample_creation="found", - bibtex_citation="""@mastersthesis{navjord2023beyond, - title={Beyond extractive: advancing abstractive automatic text summarization in Norwegian with transformers}, - author={Navjord, J{\o}rgen Johnsen and Korsvik, Jon-Mikkel Ryen}, - year={2023}, - school={Norwegian University of Life Sciences, {\AA}s} -}""", + bibtex_citation=r""" +@mastersthesis{navjord2023beyond, + author = {Navjord, J{\o}rgen Johnsen and Korsvik, Jon-Mikkel Ryen}, + school = {Norwegian University of Life Sciences, {\AA}s}, + title = {Beyond extractive: advancing abstractive automatic text summarization in Norwegian with transformers}, + year = {2023}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Clustering/pol/PolishClustering.py b/mteb/tasks/Clustering/pol/PolishClustering.py index 86626e366d..8e4358e3c4 100644 --- a/mteb/tasks/Clustering/pol/PolishClustering.py +++ b/mteb/tasks/Clustering/pol/PolishClustering.py @@ -39,36 +39,38 @@ class EightTagsClustering(AbsTaskClustering): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{dadas-etal-2020-evaluation, - title = "Evaluation of Sentence Representations in {P}olish", - author = "Dadas, Slawomir and - Pere{\\l}kiewicz, Micha{\\l} and - Po{\\'s}wiata, Rafa{\\l}", - editor = "Calzolari, Nicoletta and - B{\'e}chet, Fr{\'e}d{\'e}ric and - Blache, Philippe and - Choukri, Khalid and - Cieri, Christopher and - Declerck, Thierry and - Goggi, Sara and - Isahara, Hitoshi and - Maegaard, Bente and - Mariani, Joseph and - Mazo, H{\\'e}l{\\`e}ne and - Moreno, Asuncion and - Odijk, Jan and - Piperidis, Stelios", - booktitle = 
"Proceedings of the Twelfth Language Resources and Evaluation Conference", - month = may, - year = "2020", - address = "Marseille, France", - publisher = "European Language Resources Association", - url = "https://aclanthology.org/2020.lrec-1.207", - pages = "1674--1680", - abstract = "Methods for learning sentence representations have been actively developed in recent years. However, the lack of pre-trained models and datasets annotated at the sentence level has been a problem for low-resource languages such as Polish which led to less interest in applying these methods to language-specific tasks. In this study, we introduce two new Polish datasets for evaluating sentence embeddings and provide a comprehensive evaluation of eight sentence representation methods including Polish and multilingual models. We consider classic word embedding models, recently developed contextual embeddings and multilingual sentence encoders, showing strengths and weaknesses of specific approaches. We also examine different methods of aggregating word vectors into a single sentence vector.", - language = "English", - ISBN = "979-10-95546-34-4", - }""", + bibtex_citation=r""" +@inproceedings{dadas-etal-2020-evaluation, + abstract = {Methods for learning sentence representations have been actively developed in recent years. However, the lack of pre-trained models and datasets annotated at the sentence level has been a problem for low-resource languages such as Polish which led to less interest in applying these methods to language-specific tasks. In this study, we introduce two new Polish datasets for evaluating sentence embeddings and provide a comprehensive evaluation of eight sentence representation methods including Polish and multilingual models. We consider classic word embedding models, recently developed contextual embeddings and multilingual sentence encoders, showing strengths and weaknesses of specific approaches. 
We also examine different methods of aggregating word vectors into a single sentence vector.}, + address = {Marseille, France}, + author = {Dadas, Slawomir and +Pere{\\l}kiewicz, Micha{\\l} and +Po{\\'s}wiata, Rafa{\\l}}, + booktitle = {Proceedings of the Twelfth Language Resources and Evaluation Conference}, + editor = {Calzolari, Nicoletta and +B{\'e}chet, Fr{\'e}d{\'e}ric and +Blache, Philippe and +Choukri, Khalid and +Cieri, Christopher and +Declerck, Thierry and +Goggi, Sara and +Isahara, Hitoshi and +Maegaard, Bente and +Mariani, Joseph and +Mazo, H{\\'e}l{\\`e}ne and +Moreno, Asuncion and +Odijk, Jan and +Piperidis, Stelios}, + isbn = {979-10-95546-34-4}, + language = {English}, + month = may, + pages = {1674--1680}, + publisher = {European Language Resources Association}, + title = {Evaluation of Sentence Representations in {P}olish}, + url = {https://aclanthology.org/2020.lrec-1.207}, + year = {2020}, +} +""", ) @@ -98,36 +100,38 @@ class EightTagsClusteringFast(AbsTaskClusteringFast): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{dadas-etal-2020-evaluation, - title = "Evaluation of Sentence Representations in {P}olish", - author = "Dadas, Slawomir and - Pere{\\l}kiewicz, Micha{\\l} and - Po{\\'s}wiata, Rafa{\\l}", - editor = "Calzolari, Nicoletta and - B{\\'e}chet, Fr{\\'e}d{\\'e}ric and - Blache, Philippe and - Choukri, Khalid and - Cieri, Christopher and - Declerck, Thierry and - Goggi, Sara and - Isahara, Hitoshi and - Maegaard, Bente and - Mariani, Joseph and - Mazo, H{\\'e}l{\\`e}ne and - Moreno, Asuncion and - Odijk, Jan and - Piperidis, Stelios", - booktitle = "Proceedings of the Twelfth Language Resources and Evaluation Conference", - month = may, - year = "2020", - address = "Marseille, France", - publisher = "European Language Resources Association", - url = "https://aclanthology.org/2020.lrec-1.207", - pages = "1674--1680", - abstract = "Methods for learning sentence representations 
have been actively developed in recent years. However, the lack of pre-trained models and datasets annotated at the sentence level has been a problem for low-resource languages such as Polish which led to less interest in applying these methods to language-specific tasks. In this study, we introduce two new Polish datasets for evaluating sentence embeddings and provide a comprehensive evaluation of eight sentence representation methods including Polish and multilingual models. We consider classic word embedding models, recently developed contextual embeddings and multilingual sentence encoders, showing strengths and weaknesses of specific approaches. We also examine different methods of aggregating word vectors into a single sentence vector.", - language = "English", - ISBN = "979-10-95546-34-4", - }""", + bibtex_citation=r""" +@inproceedings{dadas-etal-2020-evaluation, + abstract = {Methods for learning sentence representations have been actively developed in recent years. However, the lack of pre-trained models and datasets annotated at the sentence level has been a problem for low-resource languages such as Polish which led to less interest in applying these methods to language-specific tasks. In this study, we introduce two new Polish datasets for evaluating sentence embeddings and provide a comprehensive evaluation of eight sentence representation methods including Polish and multilingual models. We consider classic word embedding models, recently developed contextual embeddings and multilingual sentence encoders, showing strengths and weaknesses of specific approaches. 
We also examine different methods of aggregating word vectors into a single sentence vector.}, + address = {Marseille, France}, + author = {Dadas, Slawomir and +Pere{\\l}kiewicz, Micha{\\l} and +Po{\\'s}wiata, Rafa{\\l}}, + booktitle = {Proceedings of the Twelfth Language Resources and Evaluation Conference}, + editor = {Calzolari, Nicoletta and +B{\\'e}chet, Fr{\\'e}d{\\'e}ric and +Blache, Philippe and +Choukri, Khalid and +Cieri, Christopher and +Declerck, Thierry and +Goggi, Sara and +Isahara, Hitoshi and +Maegaard, Bente and +Mariani, Joseph and +Mazo, H{\\'e}l{\\`e}ne and +Moreno, Asuncion and +Odijk, Jan and +Piperidis, Stelios}, + isbn = {979-10-95546-34-4}, + language = {English}, + month = may, + pages = {1674--1680}, + publisher = {European Language Resources Association}, + title = {Evaluation of Sentence Representations in {P}olish}, + url = {https://aclanthology.org/2020.lrec-1.207}, + year = {2020}, +} +""", adapted_from=["EightTagsClustering"], ) diff --git a/mteb/tasks/Clustering/swe/SwednClustering.py b/mteb/tasks/Clustering/swe/SwednClustering.py index bef817ab6f..6845806aa8 100644 --- a/mteb/tasks/Clustering/swe/SwednClustering.py +++ b/mteb/tasks/Clustering/swe/SwednClustering.py @@ -81,12 +81,14 @@ class SwednClusteringP2P(AbsTaskClusteringFast): dialect=[], task_subtypes=["Thematic clustering"], sample_creation="found", - bibtex_citation="""@inproceedings{monsen2021method, - title={A method for building non-english corpora for abstractive text summarization}, - author={Monsen, Julius and J{\"o}nsson, Arne}, - booktitle={Proceedings of CLARIN Annual Conference}, - year={2021} -}""", + bibtex_citation=r""" +@inproceedings{monsen2021method, + author = {Monsen, Julius and J{\"o}nsson, Arne}, + booktitle = {Proceedings of CLARIN Annual Conference}, + title = {A method for building non-english corpora for abstractive text summarization}, + year = {2021}, +} +""", prompt="Identify news categories in Swedish passages", ) @@ -121,12 +123,14 @@ class 
SwednClusteringFastS2S(AbsTaskClusteringFast): dialect=[], task_subtypes=["Thematic clustering"], sample_creation="found", - bibtex_citation="""@inproceedings{monsen2021method, - title={A method for building non-english corpora for abstractive text summarization}, - author={Monsen, Julius and J{\"o}nsson, Arne}, - booktitle={Proceedings of CLARIN Annual Conference}, - year={2021} -}""", + bibtex_citation=r""" +@inproceedings{monsen2021method, + author = {Monsen, Julius and J{\"o}nsson, Arne}, + booktitle = {Proceedings of CLARIN Annual Conference}, + title = {A method for building non-english corpora for abstractive text summarization}, + year = {2021}, +} +""", prompt="Identify news categories in Swedish passages", ) diff --git a/mteb/tasks/Clustering/swe/swedn_clustering.py b/mteb/tasks/Clustering/swe/swedn_clustering.py index ab13883172..597496f35e 100644 --- a/mteb/tasks/Clustering/swe/swedn_clustering.py +++ b/mteb/tasks/Clustering/swe/swedn_clustering.py @@ -48,12 +48,14 @@ class SwednClustering(AbsTaskClustering): dialect=[], task_subtypes=["Thematic clustering"], sample_creation="found", - bibtex_citation="""@inproceedings{monsen2021method, - title={A method for building non-english corpora for abstractive text summarization}, - author={Monsen, Julius and J{\"o}nsson, Arne}, - booktitle={Proceedings of CLARIN Annual Conference}, - year={2021} -}""", + bibtex_citation=r""" +@inproceedings{monsen2021method, + author = {Monsen, Julius and J{\"o}nsson, Arne}, + booktitle = {Proceedings of CLARIN Annual Conference}, + title = {A method for building non-english corpora for abstractive text summarization}, + year = {2021}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Clustering/zho/CMTEBClustering.py b/mteb/tasks/Clustering/zho/CMTEBClustering.py index 856a969ba0..24ea372983 100644 --- a/mteb/tasks/Clustering/zho/CMTEBClustering.py +++ b/mteb/tasks/Clustering/zho/CMTEBClustering.py @@ -39,14 +39,16 @@ class 
CLSClusteringFastS2S(AbsTaskClusteringFast): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@misc{li2022csl, - title={CSL: A Large-scale Chinese Scientific Literature Dataset}, - author={Yudong Li and Yuqing Zhang and Zhe Zhao and Linlin Shen and Weijie Liu and Weiquan Mao and Hui Zhang}, - year={2022}, - eprint={2209.05034}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }""", + bibtex_citation=r""" +@misc{li2022csl, + archiveprefix = {arXiv}, + author = {Yudong Li and Yuqing Zhang and Zhe Zhao and Linlin Shen and Weijie Liu and Weiquan Mao and Hui Zhang}, + eprint = {2209.05034}, + primaryclass = {cs.CL}, + title = {CSL: A Large-scale Chinese Scientific Literature Dataset}, + year = {2022}, +} +""", prompt="Identify the main category of scholar papers based on the titles", adapted_from=["CLSClusteringS2S"], ) @@ -96,14 +98,16 @@ class CLSClusteringFastP2P(AbsTaskClusteringFast): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@misc{li2022csl, - title={CSL: A Large-scale Chinese Scientific Literature Dataset}, - author={Yudong Li and Yuqing Zhang and Zhe Zhao and Linlin Shen and Weijie Liu and Weiquan Mao and Hui Zhang}, - year={2022}, - eprint={2209.05034}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }""", + bibtex_citation=r""" +@misc{li2022csl, + archiveprefix = {arXiv}, + author = {Yudong Li and Yuqing Zhang and Zhe Zhao and Linlin Shen and Weijie Liu and Weiquan Mao and Hui Zhang}, + eprint = {2209.05034}, + primaryclass = {cs.CL}, + title = {CSL: A Large-scale Chinese Scientific Literature Dataset}, + year = {2022}, +} +""", prompt="Identify the main category of scholar papers based on the titles and abstracts", adapted_from=["CLSClusteringP2P"], ) @@ -152,12 +156,12 @@ class CLSClusteringS2S(AbsTaskClustering): annotations_creators=None, dialect=None, sample_creation=None, - bibtex_citation=""" + bibtex_citation=r""" @article{li2022csl, - title={CSL: A large-scale 
Chinese scientific literature dataset}, - author={Li, Yudong and Zhang, Yuqing and Zhao, Zhe and Shen, Linlin and Liu, Weijie and Mao, Weiquan and Zhang, Hui}, - journal={arXiv preprint arXiv:2209.05034}, - year={2022} + author = {Li, Yudong and Zhang, Yuqing and Zhao, Zhe and Shen, Linlin and Liu, Weijie and Mao, Weiquan and Zhang, Hui}, + journal = {arXiv preprint arXiv:2209.05034}, + title = {CSL: A large-scale Chinese scientific literature dataset}, + year = {2022}, } """, prompt="Identify the main category of scholar papers based on the titles", @@ -188,12 +192,14 @@ class CLSClusteringP2P(AbsTaskClustering): annotations_creators=None, dialect=None, sample_creation=None, - bibtex_citation="""@article{li2022csl, - title={CSL: A large-scale Chinese scientific literature dataset}, - author={Li, Yudong and Zhang, Yuqing and Zhao, Zhe and Shen, Linlin and Liu, Weijie and Mao, Weiquan and Zhang, Hui}, - journal={arXiv preprint arXiv:2209.05034}, - year={2022} -}""", + bibtex_citation=r""" +@article{li2022csl, + author = {Li, Yudong and Zhang, Yuqing and Zhao, Zhe and Shen, Linlin and Liu, Weijie and Mao, Weiquan and Zhang, Hui}, + journal = {arXiv preprint arXiv:2209.05034}, + title = {CSL: A large-scale Chinese scientific literature dataset}, + year = {2022}, +} +""", prompt="Identify the main category of scholar papers based on the titles and abstracts", ) @@ -223,14 +229,16 @@ class ThuNewsClusteringFastS2S(AbsTaskClusteringFast): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@software{THUCTC, + bibtex_citation=r""" +@software{THUCTC, author = {Sun, M. and Li, J. and Guo, Z. and Yu, Z. and Zheng, Y. and Si, X. 
and Liu, Z.}, - title = {THUCTC: An Efficient Chinese Text Classifier}, - year = {2016}, note = {THU Chinese Text Classification Toolkit}, publisher = {THU Natural Language Processing Lab}, - url = {https://github.com/thunlp/THUCTC} -}""", + title = {THUCTC: An Efficient Chinese Text Classifier}, + url = {https://github.com/thunlp/THUCTC}, + year = {2016}, +} +""", prompt="Identify the topic or theme of the given news articles based on the titles", adapted_from=["ThuNewsClusteringS2S"], ) @@ -280,14 +288,16 @@ class ThuNewsClusteringFastP2P(AbsTaskClusteringFast): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@software{THUCTC, + bibtex_citation=r""" +@software{THUCTC, author = {Sun, M. and Li, J. and Guo, Z. and Yu, Z. and Zheng, Y. and Si, X. and Liu, Z.}, - title = {THUCTC: An Efficient Chinese Text Classifier}, - year = {2016}, note = {THU Chinese Text Classification Toolkit}, publisher = {THU Natural Language Processing Lab}, - url = {https://github.com/thunlp/THUCTC} -}""", + title = {THUCTC: An Efficient Chinese Text Classifier}, + url = {https://github.com/thunlp/THUCTC}, + year = {2016}, +} +""", prompt="Identify the topic or theme of the given news articles based on the titles and contents", adapted_from=["ThuNewsClusteringP2P"], ) @@ -336,19 +346,20 @@ class ThuNewsClusteringS2S(AbsTaskClustering): annotations_creators=None, dialect=None, sample_creation=None, - bibtex_citation=""" + bibtex_citation=r""" @inproceedings{eisner2007proceedings, - title={Proceedings of the 2007 joint conference on empirical methods in natural language processing and computational natural language learning (EMNLP-CoNLL)}, - author={Eisner, Jason}, - booktitle={Proceedings of the 2007 Joint Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning (EMNLP-CoNLL)}, - year={2007} + author = {Eisner, Jason}, + booktitle = {Proceedings of the 2007 Joint Conference on Empirical Methods in 
Natural Language Processing and Computational Natural Language Learning (EMNLP-CoNLL)}, + title = {Proceedings of the 2007 joint conference on empirical methods in natural language processing and computational natural language learning (EMNLP-CoNLL)}, + year = {2007}, } + @inproceedings{li2006comparison, - title={A comparison and semi-quantitative analysis of words and character-bigrams as features in chinese text categorization}, - author={Li, Jingyang and Sun, Maosong and Zhang, Xian}, - booktitle={proceedings of the 21st international conference on computational linguistics and 44th annual meeting of the association for computational linguistics}, - pages={545--552}, - year={2006} + author = {Li, Jingyang and Sun, Maosong and Zhang, Xian}, + booktitle = {proceedings of the 21st international conference on computational linguistics and 44th annual meeting of the association for computational linguistics}, + pages = {545--552}, + title = {A comparison and semi-quantitative analysis of words and character-bigrams as features in chinese text categorization}, + year = {2006}, } """, prompt="Identify the topic or theme of the given news articles based on the titles", @@ -379,19 +390,20 @@ class ThuNewsClusteringP2P(AbsTaskClustering): annotations_creators=None, dialect=None, sample_creation=None, - bibtex_citation=""" + bibtex_citation=r""" @inproceedings{eisner2007proceedings, - title={Proceedings of the 2007 joint conference on empirical methods in natural language processing and computational natural language learning (EMNLP-CoNLL)}, - author={Eisner, Jason}, - booktitle={Proceedings of the 2007 Joint Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning (EMNLP-CoNLL)}, - year={2007} + author = {Eisner, Jason}, + booktitle = {Proceedings of the 2007 Joint Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning (EMNLP-CoNLL)}, + title = {Proceedings of the 2007 
joint conference on empirical methods in natural language processing and computational natural language learning (EMNLP-CoNLL)}, + year = {2007}, } + @inproceedings{li2006comparison, - title={A comparison and semi-quantitative analysis of words and character-bigrams as features in chinese text categorization}, - author={Li, Jingyang and Sun, Maosong and Zhang, Xian}, - booktitle={proceedings of the 21st international conference on computational linguistics and 44th annual meeting of the association for computational linguistics}, - pages={545--552}, - year={2006} + author = {Li, Jingyang and Sun, Maosong and Zhang, Xian}, + booktitle = {proceedings of the 21st international conference on computational linguistics and 44th annual meeting of the association for computational linguistics}, + pages = {545--552}, + title = {A comparison and semi-quantitative analysis of words and character-bigrams as features in chinese text categorization}, + year = {2006}, } """, prompt="Identify the topic or theme of the given news articles based on the titles and contents", diff --git a/mteb/tasks/Image/Any2AnyMultiChoice/eng/BLINKIT2IMultiChoice.py b/mteb/tasks/Image/Any2AnyMultiChoice/eng/BLINKIT2IMultiChoice.py index 2909c87b9e..3ebe9db7a6 100644 --- a/mteb/tasks/Image/Any2AnyMultiChoice/eng/BLINKIT2IMultiChoice.py +++ b/mteb/tasks/Image/Any2AnyMultiChoice/eng/BLINKIT2IMultiChoice.py @@ -27,11 +27,12 @@ class BLINKIT2IMultiChoice(AbsTaskAny2AnyMultiChoice): dialect=[], modalities=["text", "image"], sample_creation="found", - bibtex_citation="""@article{fu2024blink, - title={Blink: Multimodal large language models can see but not perceive}, - author={Fu, Xingyu and Hu, Yushi and Li, Bangzheng and Feng, Yu and Wang, Haoyu and Lin, Xudong and Roth, Dan and Smith, Noah A and Ma, Wei-Chiu and Krishna, Ranjay}, - journal={arXiv preprint arXiv:2404.12390}, - year={2024} + bibtex_citation=r""" +@article{fu2024blink, + author = {Fu, Xingyu and Hu, Yushi and Li, Bangzheng and Feng, Yu and 
Wang, Haoyu and Lin, Xudong and Roth, Dan and Smith, Noah A and Ma, Wei-Chiu and Krishna, Ranjay}, + journal = {arXiv preprint arXiv:2404.12390}, + title = {Blink: Multimodal large language models can see but not perceive}, + year = {2024}, } """, descriptive_stats={ diff --git a/mteb/tasks/Image/Any2AnyMultiChoice/eng/BLINKIT2TMultiChoice.py b/mteb/tasks/Image/Any2AnyMultiChoice/eng/BLINKIT2TMultiChoice.py index 4f87c5aaa9..225cb0d971 100644 --- a/mteb/tasks/Image/Any2AnyMultiChoice/eng/BLINKIT2TMultiChoice.py +++ b/mteb/tasks/Image/Any2AnyMultiChoice/eng/BLINKIT2TMultiChoice.py @@ -26,11 +26,12 @@ class BLINKIT2TMultiChoice(AbsTaskAny2AnyMultiChoice): dialect=[], modalities=["text", "image"], sample_creation="found", - bibtex_citation="""@article{fu2024blink, - title={Blink: Multimodal large language models can see but not perceive}, - author={Fu, Xingyu and Hu, Yushi and Li, Bangzheng and Feng, Yu and Wang, Haoyu and Lin, Xudong and Roth, Dan and Smith, Noah A and Ma, Wei-Chiu and Krishna, Ranjay}, - journal={arXiv preprint arXiv:2404.12390}, - year={2024} + bibtex_citation=r""" +@article{fu2024blink, + author = {Fu, Xingyu and Hu, Yushi and Li, Bangzheng and Feng, Yu and Wang, Haoyu and Lin, Xudong and Roth, Dan and Smith, Noah A and Ma, Wei-Chiu and Krishna, Ranjay}, + journal = {arXiv preprint arXiv:2404.12390}, + title = {Blink: Multimodal large language models can see but not perceive}, + year = {2024}, } """, descriptive_stats={ diff --git a/mteb/tasks/Image/Any2AnyMultiChoice/eng/CVBench.py b/mteb/tasks/Image/Any2AnyMultiChoice/eng/CVBench.py index 2409424e96..5c36549cdc 100644 --- a/mteb/tasks/Image/Any2AnyMultiChoice/eng/CVBench.py +++ b/mteb/tasks/Image/Any2AnyMultiChoice/eng/CVBench.py @@ -109,12 +109,14 @@ class CVBenchCount(AbsTaskAny2AnyMultiChoice): dialect=[], modalities=["image", "text"], sample_creation="found", - bibtex_citation="""@article{tong2024cambrian, - title={Cambrian-1: A fully open, vision-centric exploration of multimodal llms}, - 
author={Tong, Shengbang and Brown, Ellis and Wu, Penghao and Woo, Sanghyun and Middepogu, Manoj and Akula, Sai Charitha and Yang, Jihan and Yang, Shusheng and Iyer, Adithya and Pan, Xichen and others}, - journal={arXiv preprint arXiv:2406.16860}, - year={2024} -}""", + bibtex_citation=r""" +@article{tong2024cambrian, + author = {Tong, Shengbang and Brown, Ellis and Wu, Penghao and Woo, Sanghyun and Middepogu, Manoj and Akula, Sai Charitha and Yang, Jihan and Yang, Shusheng and Iyer, Adithya and Pan, Xichen and others}, + journal = {arXiv preprint arXiv:2406.16860}, + title = {Cambrian-1: A fully open, vision-centric exploration of multimodal llms}, + year = {2024}, +} +""", descriptive_stats={ "n_samples": {"test": 419}, "avg_character_length": { @@ -162,12 +164,14 @@ class CVBenchRelation(AbsTaskAny2AnyMultiChoice): dialect=[], modalities=["text", "image"], sample_creation="found", - bibtex_citation="""@article{tong2024cambrian, - title={Cambrian-1: A fully open, vision-centric exploration of multimodal llms}, - author={Tong, Shengbang and Brown, Ellis and Wu, Penghao and Woo, Sanghyun and Middepogu, Manoj and Akula, Sai Charitha and Yang, Jihan and Yang, Shusheng and Iyer, Adithya and Pan, Xichen and others}, - journal={arXiv preprint arXiv:2406.16860}, - year={2024} -}""", + bibtex_citation=r""" +@article{tong2024cambrian, + author = {Tong, Shengbang and Brown, Ellis and Wu, Penghao and Woo, Sanghyun and Middepogu, Manoj and Akula, Sai Charitha and Yang, Jihan and Yang, Shusheng and Iyer, Adithya and Pan, Xichen and others}, + journal = {arXiv preprint arXiv:2406.16860}, + title = {Cambrian-1: A fully open, vision-centric exploration of multimodal llms}, + year = {2024}, +} +""", descriptive_stats={ "n_samples": {"test": 654}, "avg_character_length": { @@ -215,12 +219,14 @@ class CVBenchDepth(AbsTaskAny2AnyMultiChoice): dialect=[], modalities=["text", "image"], sample_creation="found", - bibtex_citation="""@article{tong2024cambrian, - title={Cambrian-1: A fully 
open, vision-centric exploration of multimodal llms}, - author={Tong, Shengbang and Brown, Ellis and Wu, Penghao and Woo, Sanghyun and Middepogu, Manoj and Akula, Sai Charitha and Yang, Jihan and Yang, Shusheng and Iyer, Adithya and Pan, Xichen and others}, - journal={arXiv preprint arXiv:2406.16860}, - year={2024} -}""", + bibtex_citation=r""" +@article{tong2024cambrian, + author = {Tong, Shengbang and Brown, Ellis and Wu, Penghao and Woo, Sanghyun and Middepogu, Manoj and Akula, Sai Charitha and Yang, Jihan and Yang, Shusheng and Iyer, Adithya and Pan, Xichen and others}, + journal = {arXiv preprint arXiv:2406.16860}, + title = {Cambrian-1: A fully open, vision-centric exploration of multimodal llms}, + year = {2024}, +} +""", descriptive_stats={ "n_samples": {"test": 669}, "avg_character_length": { @@ -268,12 +274,14 @@ class CVBenchDistance(AbsTaskAny2AnyMultiChoice): dialect=[], modalities=["text", "image"], sample_creation="found", - bibtex_citation="""@article{tong2024cambrian, - title={Cambrian-1: A fully open, vision-centric exploration of multimodal llms}, - author={Tong, Shengbang and Brown, Ellis and Wu, Penghao and Woo, Sanghyun and Middepogu, Manoj and Akula, Sai Charitha and Yang, Jihan and Yang, Shusheng and Iyer, Adithya and Pan, Xichen and others}, - journal={arXiv preprint arXiv:2406.16860}, - year={2024} -}""", + bibtex_citation=r""" +@article{tong2024cambrian, + author = {Tong, Shengbang and Brown, Ellis and Wu, Penghao and Woo, Sanghyun and Middepogu, Manoj and Akula, Sai Charitha and Yang, Jihan and Yang, Shusheng and Iyer, Adithya and Pan, Xichen and others}, + journal = {arXiv preprint arXiv:2406.16860}, + title = {Cambrian-1: A fully open, vision-centric exploration of multimodal llms}, + year = {2024}, +} +""", descriptive_stats={ "n_samples": {"test": 656}, "avg_character_length": { diff --git a/mteb/tasks/Image/Any2AnyMultiChoice/eng/ImageCoDeT2IMultiChoice.py b/mteb/tasks/Image/Any2AnyMultiChoice/eng/ImageCoDeT2IMultiChoice.py index 
f9400ff280..676fcd0a59 100644 --- a/mteb/tasks/Image/Any2AnyMultiChoice/eng/ImageCoDeT2IMultiChoice.py +++ b/mteb/tasks/Image/Any2AnyMultiChoice/eng/ImageCoDeT2IMultiChoice.py @@ -26,11 +26,12 @@ class ImageCoDeT2IMultiChoice(AbsTaskAny2AnyMultiChoice): dialect=[], modalities=["text", "image"], sample_creation="found", - bibtex_citation="""@article{krojer2022image, - title={Image retrieval from contextual descriptions}, - author={Krojer, Benno and Adlakha, Vaibhav and Vineet, Vibhav and Goyal, Yash and Ponti, Edoardo and Reddy, Siva}, - journal={arXiv preprint arXiv:2203.15867}, - year={2022} + bibtex_citation=r""" +@article{krojer2022image, + author = {Krojer, Benno and Adlakha, Vaibhav and Vineet, Vibhav and Goyal, Yash and Ponti, Edoardo and Reddy, Siva}, + journal = {arXiv preprint arXiv:2203.15867}, + title = {Image retrieval from contextual descriptions}, + year = {2022}, } """, descriptive_stats={ diff --git a/mteb/tasks/Image/Any2AnyMultiChoice/eng/ROxfordI2IMultiChoice.py b/mteb/tasks/Image/Any2AnyMultiChoice/eng/ROxfordI2IMultiChoice.py index d5661e2840..25cece36d0 100644 --- a/mteb/tasks/Image/Any2AnyMultiChoice/eng/ROxfordI2IMultiChoice.py +++ b/mteb/tasks/Image/Any2AnyMultiChoice/eng/ROxfordI2IMultiChoice.py @@ -28,14 +28,15 @@ class ROxfordEasyI2IMultiChoice(AbsTaskAny2AnyMultiChoice): dialect=[], modalities=["image"], sample_creation="created", - bibtex_citation="""@inproceedings{radenovic2018revisiting, - title={Revisiting oxford and paris: Large-scale image MultiChoice benchmarking}, - author={Radenovi{\'c}, Filip and Iscen, Ahmet and Tolias, Giorgos and Avrithis, Yannis and Chum, Ond{\v{r}}ej}, - booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, - pages={5706--5715}, - year={2018} + bibtex_citation=r""" +@inproceedings{radenovic2018revisiting, + author = {Radenovi{\'c}, Filip and Iscen, Ahmet and Tolias, Giorgos and Avrithis, Yannis and Chum, Ond{\v{r}}ej}, + booktitle = {Proceedings of the IEEE conference 
on computer vision and pattern recognition}, + pages = {5706--5715}, + title = {Revisiting oxford and paris: Large-scale image MultiChoice benchmarking}, + year = {2018}, } - """, +""", descriptive_stats={ "n_samples": {"test": 70}, "avg_character_length": { @@ -74,14 +75,15 @@ class ROxfordMediumI2IMultiChoice(AbsTaskAny2AnyMultiChoice): dialect=[], modalities=["image"], sample_creation="created", - bibtex_citation="""@inproceedings{radenovic2018revisiting, - title={Revisiting oxford and paris: Large-scale image MultiChoice benchmarking}, - author={Radenovi{\'c}, Filip and Iscen, Ahmet and Tolias, Giorgos and Avrithis, Yannis and Chum, Ond{\v{r}}ej}, - booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, - pages={5706--5715}, - year={2018} + bibtex_citation=r""" +@inproceedings{radenovic2018revisiting, + author = {Radenovi{\'c}, Filip and Iscen, Ahmet and Tolias, Giorgos and Avrithis, Yannis and Chum, Ond{\v{r}}ej}, + booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages = {5706--5715}, + title = {Revisiting oxford and paris: Large-scale image MultiChoice benchmarking}, + year = {2018}, } - """, +""", descriptive_stats={ "n_samples": {"test": 70}, "avg_character_length": { @@ -120,14 +122,15 @@ class ROxfordHardI2IMultiChoice(AbsTaskAny2AnyMultiChoice): dialect=[], modalities=["image"], sample_creation="created", - bibtex_citation="""@inproceedings{radenovic2018revisiting, - title={Revisiting oxford and paris: Large-scale image MultiChoice benchmarking}, - author={Radenovi{\'c}, Filip and Iscen, Ahmet and Tolias, Giorgos and Avrithis, Yannis and Chum, Ond{\v{r}}ej}, - booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, - pages={5706--5715}, - year={2018} + bibtex_citation=r""" +@inproceedings{radenovic2018revisiting, + author = {Radenovi{\'c}, Filip and Iscen, Ahmet and Tolias, Giorgos and Avrithis, Yannis and Chum, Ond{\v{r}}ej}, + booktitle 
= {Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages = {5706--5715}, + title = {Revisiting oxford and paris: Large-scale image MultiChoice benchmarking}, + year = {2018}, } - """, +""", descriptive_stats={ "n_samples": {"test": 70}, "avg_character_length": { diff --git a/mteb/tasks/Image/Any2AnyMultiChoice/eng/RParisI2IMultiChoice.py b/mteb/tasks/Image/Any2AnyMultiChoice/eng/RParisI2IMultiChoice.py index 754111b594..59a77f9eb5 100644 --- a/mteb/tasks/Image/Any2AnyMultiChoice/eng/RParisI2IMultiChoice.py +++ b/mteb/tasks/Image/Any2AnyMultiChoice/eng/RParisI2IMultiChoice.py @@ -28,14 +28,15 @@ class RParisEasyI2IMultiChoice(AbsTaskAny2AnyMultiChoice): dialect=[], modalities=["image"], sample_creation="created", - bibtex_citation="""@inproceedings{radenovic2018revisiting, - title={Revisiting paris and paris: Large-scale image MultiChoice benchmarking}, - author={Radenovi{\'c}, Filip and Iscen, Ahmet and Tolias, Giorgos and Avrithis, Yannis and Chum, Ond{\v{r}}ej}, - booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, - pages={5706--5715}, - year={2018} + bibtex_citation=r""" +@inproceedings{radenovic2018revisiting, + author = {Radenovi{\'c}, Filip and Iscen, Ahmet and Tolias, Giorgos and Avrithis, Yannis and Chum, Ond{\v{r}}ej}, + booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages = {5706--5715}, + title = {Revisiting paris and paris: Large-scale image MultiChoice benchmarking}, + year = {2018}, } - """, +""", descriptive_stats={ "n_samples": {"test": 70}, "avg_character_length": { @@ -74,14 +75,15 @@ class RParisMediumI2IMultiChoice(AbsTaskAny2AnyMultiChoice): dialect=[], modalities=["image"], sample_creation="created", - bibtex_citation="""@inproceedings{radenovic2018revisiting, - title={Revisiting paris and paris: Large-scale image MultiChoice benchmarking}, - author={Radenovi{\'c}, Filip and Iscen, Ahmet and Tolias, Giorgos and Avrithis, 
Yannis and Chum, Ond{\v{r}}ej}, - booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, - pages={5706--5715}, - year={2018} + bibtex_citation=r""" +@inproceedings{radenovic2018revisiting, + author = {Radenovi{\'c}, Filip and Iscen, Ahmet and Tolias, Giorgos and Avrithis, Yannis and Chum, Ond{\v{r}}ej}, + booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages = {5706--5715}, + title = {Revisiting paris and paris: Large-scale image MultiChoice benchmarking}, + year = {2018}, } - """, +""", descriptive_stats={ "n_samples": {"test": 70}, "avg_character_length": { @@ -120,14 +122,15 @@ class RParisHardI2IMultiChoice(AbsTaskAny2AnyMultiChoice): dialect=[], modalities=["image"], sample_creation="created", - bibtex_citation="""@inproceedings{radenovic2018revisiting, - title={Revisiting paris and paris: Large-scale image MultiChoice benchmarking}, - author={Radenovi{\'c}, Filip and Iscen, Ahmet and Tolias, Giorgos and Avrithis, Yannis and Chum, Ond{\v{r}}ej}, - booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, - pages={5706--5715}, - year={2018} + bibtex_citation=r""" +@inproceedings{radenovic2018revisiting, + author = {Radenovi{\'c}, Filip and Iscen, Ahmet and Tolias, Giorgos and Avrithis, Yannis and Chum, Ond{\v{r}}ej}, + booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages = {5706--5715}, + title = {Revisiting paris and paris: Large-scale image MultiChoice benchmarking}, + year = {2018}, } - """, +""", descriptive_stats={ "n_samples": {"test": 70}, "avg_character_length": { diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/BLINKIT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/BLINKIT2IRetrieval.py index 8202bb133c..eec1e9f3d4 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/BLINKIT2IRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/BLINKIT2IRetrieval.py @@ -18,7 +18,7 @@ class 
BLINKIT2IRetrieval(AbsTaskAny2AnyRetrieval): category="it2i", eval_splits=["test"], eval_langs=["eng-Latn"], - main_score="ndcg_at_10", + main_score="cv_recall_at_1", date=("2018-01-01", "2018-12-31"), domains=["Encyclopaedic"], task_subtypes=["Image Text Retrieval"], @@ -27,11 +27,12 @@ class BLINKIT2IRetrieval(AbsTaskAny2AnyRetrieval): dialect=[], modalities=["text", "image"], sample_creation="found", - bibtex_citation="""@article{fu2024blink, - title={Blink: Multimodal large language models can see but not perceive}, - author={Fu, Xingyu and Hu, Yushi and Li, Bangzheng and Feng, Yu and Wang, Haoyu and Lin, Xudong and Roth, Dan and Smith, Noah A and Ma, Wei-Chiu and Krishna, Ranjay}, - journal={arXiv preprint arXiv:2404.12390}, - year={2024} + bibtex_citation=r""" +@article{fu2024blink, + author = {Fu, Xingyu and Hu, Yushi and Li, Bangzheng and Feng, Yu and Wang, Haoyu and Lin, Xudong and Roth, Dan and Smith, Noah A and Ma, Wei-Chiu and Krishna, Ranjay}, + journal = {arXiv preprint arXiv:2404.12390}, + title = {Blink: Multimodal large language models can see but not perceive}, + year = {2024}, } """, descriptive_stats={ diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/BLINKIT2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/BLINKIT2TRetrieval.py index ff6ec42427..377e7c80c8 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/BLINKIT2TRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/BLINKIT2TRetrieval.py @@ -18,7 +18,7 @@ class BLINKIT2TRetrieval(AbsTaskAny2AnyRetrieval): category="it2t", eval_splits=["test"], eval_langs=["eng-Latn"], - main_score="ndcg_at_10", + main_score="cv_recall_at_1", date=("2018-01-01", "2018-12-31"), domains=["Encyclopaedic"], task_subtypes=["Image Text Retrieval"], @@ -27,11 +27,12 @@ class BLINKIT2TRetrieval(AbsTaskAny2AnyRetrieval): dialect=[], modalities=["text", "image"], sample_creation="found", - bibtex_citation="""@article{fu2024blink, - title={Blink: Multimodal large language models can see but not perceive}, - 
author={Fu, Xingyu and Hu, Yushi and Li, Bangzheng and Feng, Yu and Wang, Haoyu and Lin, Xudong and Roth, Dan and Smith, Noah A and Ma, Wei-Chiu and Krishna, Ranjay}, - journal={arXiv preprint arXiv:2404.12390}, - year={2024} + bibtex_citation=r""" +@article{fu2024blink, + author = {Fu, Xingyu and Hu, Yushi and Li, Bangzheng and Feng, Yu and Wang, Haoyu and Lin, Xudong and Roth, Dan and Smith, Noah A and Ma, Wei-Chiu and Krishna, Ranjay}, + journal = {arXiv preprint arXiv:2404.12390}, + title = {Blink: Multimodal large language models can see but not perceive}, + year = {2024}, } """, descriptive_stats={ diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/CIRRIT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/CIRRIT2IRetrieval.py index ed0172ae79..91f6c970e8 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/CIRRIT2IRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/CIRRIT2IRetrieval.py @@ -27,13 +27,15 @@ class CIRRIT2IRetrieval(AbsTaskAny2AnyRetrieval): dialect=[], modalities=["text", "image"], sample_creation="found", - bibtex_citation="""@inproceedings{liu2021image, - title={Image retrieval on real-life images with pre-trained vision-and-language models}, - author={Liu, Zheyuan and Rodriguez-Opazo, Cristian and Teney, Damien and Gould, Stephen}, - booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision}, - pages={2125--2134}, - year={2021} - }""", + bibtex_citation=r""" +@inproceedings{liu2021image, + author = {Liu, Zheyuan and Rodriguez-Opazo, Cristian and Teney, Damien and Gould, Stephen}, + booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision}, + pages = {2125--2134}, + title = {Image retrieval on real-life images with pre-trained vision-and-language models}, + year = {2021}, +} +""", prompt={ "query": "Retrieve a day-to-day image that aligns with the modification instructions of the provided image." 
}, diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/CUB200I2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/CUB200I2IRetrieval.py index 95a3c9a77c..1b8a39dc79 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/CUB200I2IRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/CUB200I2IRetrieval.py @@ -26,14 +26,15 @@ class CUB200I2I(AbsTaskAny2AnyRetrieval): dialect=[], modalities=["image"], sample_creation="created", - bibtex_citation="""@article{article, - author = {Welinder, Peter and Branson, Steve and Mita, Takeshi and Wah, Catherine and Schroff, Florian and Belongie, Serge and Perona, Pietro}, - year = {2010}, - month = {09}, - pages = {}, - title = {Caltech-UCSD Birds 200} - } - """, + bibtex_citation=r""" +@article{article, + author = {Welinder, Peter and Branson, Steve and Mita, Takeshi and Wah, Catherine and Schroff, Florian and Belongie, Serge and Perona, Pietro}, + month = {09}, + pages = {}, + title = {Caltech-UCSD Birds 200}, + year = {2010}, +} +""", descriptive_stats={ "n_samples": {"default": 5794}, "avg_character_length": { diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/EDIST2ITRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/EDIST2ITRetrieval.py index ac7b310998..65941e2cf9 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/EDIST2ITRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/EDIST2ITRetrieval.py @@ -26,13 +26,15 @@ class EDIST2ITRetrieval(AbsTaskAny2AnyRetrieval): dialect=[], modalities=["text", "image"], sample_creation="created", - bibtex_citation="""@inproceedings{liu2023edis, - title={EDIS: Entity-Driven Image Search over Multimodal Web Content}, - author={Liu, Siqi and Feng, Weixi and Fu, Tsu-Jui and Chen, Wenhu and Wang, William}, - booktitle={Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing}, - pages={4877--4894}, - year={2023} -}""", + bibtex_citation=r""" +@inproceedings{liu2023edis, + author = {Liu, Siqi and Feng, Weixi and Fu, Tsu-Jui and Chen, Wenhu and Wang, William}, 
+ booktitle = {Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing}, + pages = {4877--4894}, + title = {EDIS: Entity-Driven Image Search over Multimodal Web Content}, + year = {2023}, +} +""", prompt={"query": "Identify the news photo for the given caption."}, descriptive_stats={ "n_samples": {"test": 3241}, diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/EncyclopediaVQAIT2ITRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/EncyclopediaVQAIT2ITRetrieval.py index 01f2e6a980..7105c4f391 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/EncyclopediaVQAIT2ITRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/EncyclopediaVQAIT2ITRetrieval.py @@ -26,13 +26,15 @@ class EncyclopediaVQAIT2ITRetrieval(AbsTaskAny2AnyRetrieval): dialect=[], modalities=["image", "text"], sample_creation="created", - bibtex_citation="""@inproceedings{mensink2023encyclopedic, - title={Encyclopedic VQA: Visual questions about detailed properties of fine-grained categories}, - author={Mensink, Thomas and Uijlings, Jasper and Castrejon, Lluis and Goel, Arushi and Cadar, Felipe and Zhou, Howard and Sha, Fei and Araujo, Andr{\'e} and Ferrari, Vittorio}, - booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision}, - pages={3113--3124}, - year={2023} -}""", + bibtex_citation=r""" +@inproceedings{mensink2023encyclopedic, + author = {Mensink, Thomas and Uijlings, Jasper and Castrejon, Lluis and Goel, Arushi and Cadar, Felipe and Zhou, Howard and Sha, Fei and Araujo, Andr{\'e} and Ferrari, Vittorio}, + booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision}, + pages = {3113--3124}, + title = {Encyclopedic VQA: Visual questions about detailed properties of fine-grained categories}, + year = {2023}, +} +""", prompt={ "query": "Obtain illustrated documents that correspond to the inquiry alongside the provided image." 
}, diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/FORBI2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/FORBI2IRetrieval.py index 170ce44f25..ff3180e780 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/FORBI2IRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/FORBI2IRetrieval.py @@ -26,16 +26,17 @@ class FORBI2I(AbsTaskAny2AnyRetrieval): dialect=[], modalities=["image"], sample_creation="created", - bibtex_citation="""@misc{wu2023forbflatobjectretrieval, - title={FORB: A Flat Object Retrieval Benchmark for Universal Image Embedding}, - author={Pengxiang Wu and Siman Wang and Kevin Dela Rosa and Derek Hao Hu}, - year={2023}, - eprint={2309.16249}, - archivePrefix={arXiv}, - primaryClass={cs.CV}, - url={https://arxiv.org/abs/2309.16249}, - } - """, + bibtex_citation=r""" +@misc{wu2023forbflatobjectretrieval, + archiveprefix = {arXiv}, + author = {Pengxiang Wu and Siman Wang and Kevin Dela Rosa and Derek Hao Hu}, + eprint = {2309.16249}, + primaryclass = {cs.CV}, + title = {FORB: A Flat Object Retrieval Benchmark for Universal Image Embedding}, + url = {https://arxiv.org/abs/2309.16249}, + year = {2023}, +} +""", descriptive_stats={ "n_samples": {"default": 13250}, "avg_character_length": { diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/Fashion200kI2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/Fashion200kI2TRetrieval.py index 5ba43daf1d..c0e89b0810 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/Fashion200kI2TRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/Fashion200kI2TRetrieval.py @@ -26,13 +26,15 @@ class Fashion200kI2TRetrieval(AbsTaskAny2AnyRetrieval): dialect=[], modalities=["text", "image"], sample_creation="created", - bibtex_citation="""@inproceedings{han2017automatic, - title={Automatic spatially-aware fashion concept discovery}, - author={Han, Xintong and Wu, Zuxuan and Huang, Phoenix X and Zhang, Xiao and Zhu, Menglong and Li, Yuan and Zhao, Yang and Davis, Larry S}, - booktitle={Proceedings of the IEEE 
international conference on computer vision}, - pages={1463--1471}, - year={2017} -}""", + bibtex_citation=r""" +@inproceedings{han2017automatic, + author = {Han, Xintong and Wu, Zuxuan and Huang, Phoenix X and Zhang, Xiao and Zhu, Menglong and Li, Yuan and Zhao, Yang and Davis, Larry S}, + booktitle = {Proceedings of the IEEE international conference on computer vision}, + pages = {1463--1471}, + title = {Automatic spatially-aware fashion concept discovery}, + year = {2017}, +} +""", prompt={ "query": "Based on the following fashion description, retrieve the best matching image." }, diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/Fashion200kT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/Fashion200kT2IRetrieval.py index 1511de7aa4..385ec4a6b8 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/Fashion200kT2IRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/Fashion200kT2IRetrieval.py @@ -27,13 +27,15 @@ class Fashion200kT2IRetrieval(AbsTaskAny2AnyRetrieval): dialect=[], modalities=["text", "image"], sample_creation="created", - bibtex_citation="""@inproceedings{han2017automatic, - title={Automatic spatially-aware fashion concept discovery}, - author={Han, Xintong and Wu, Zuxuan and Huang, Phoenix X and Zhang, Xiao and Zhu, Menglong and Li, Yuan and Zhao, Yang and Davis, Larry S}, - booktitle={Proceedings of the IEEE international conference on computer vision}, - pages={1463--1471}, - year={2017} -}""", + bibtex_citation=r""" +@inproceedings{han2017automatic, + author = {Han, Xintong and Wu, Zuxuan and Huang, Phoenix X and Zhang, Xiao and Zhu, Menglong and Li, Yuan and Zhao, Yang and Davis, Larry S}, + booktitle = {Proceedings of the IEEE international conference on computer vision}, + pages = {1463--1471}, + title = {Automatic spatially-aware fashion concept discovery}, + year = {2017}, +} +""", prompt={ "query": "Based on the following fashion description, retrieve the best matching image." 
}, diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/FashionIQIT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/FashionIQIT2IRetrieval.py index 4e1209c23c..a2910c3f16 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/FashionIQIT2IRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/FashionIQIT2IRetrieval.py @@ -27,13 +27,15 @@ class FashionIQIT2IRetrieval(AbsTaskAny2AnyRetrieval): dialect=[], modalities=["text", "image"], sample_creation="created", - bibtex_citation="""@inproceedings{wu2021fashion, - title={Fashion iq: A new dataset towards retrieving images by natural language feedback}, - author={Wu, Hui and Gao, Yupeng and Guo, Xiaoxiao and Al-Halah, Ziad and Rennie, Steven and Grauman, Kristen and Feris, Rogerio}, - booktitle={Proceedings of the IEEE/CVF Conference on computer vision and pattern recognition}, - pages={11307--11317}, - year={2021} -}""", + bibtex_citation=r""" +@inproceedings{wu2021fashion, + author = {Wu, Hui and Gao, Yupeng and Guo, Xiaoxiao and Al-Halah, Ziad and Rennie, Steven and Grauman, Kristen and Feris, Rogerio}, + booktitle = {Proceedings of the IEEE/CVF Conference on computer vision and pattern recognition}, + pages = {11307--11317}, + title = {Fashion iq: A new dataset towards retrieving images by natural language feedback}, + year = {2021}, +} +""", prompt={ "query": "Find a fashion image that aligns with the reference image and style note." 
}, diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/Flickr30kI2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/Flickr30kI2TRetrieval.py index 43aeea20d4..1354473ba6 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/Flickr30kI2TRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/Flickr30kI2TRetrieval.py @@ -26,15 +26,17 @@ class Flickr30kI2TRetrieval(AbsTaskAny2AnyRetrieval): dialect=[], modalities=["text", "image"], sample_creation="found", - bibtex_citation="""@article{Young2014FromID, - title={From image descriptions to visual denotations: New similarity metrics for semantic inference over event descriptions}, - author={Peter Young and Alice Lai and Micah Hodosh and J. Hockenmaier}, - journal={Transactions of the Association for Computational Linguistics}, - year={2014}, - volume={2}, - pages={67-78}, - url={https://api.semanticscholar.org/CorpusID:3104920} -}""", + bibtex_citation=r""" +@article{Young2014FromID, + author = {Peter Young and Alice Lai and Micah Hodosh and J. 
Hockenmaier}, + journal = {Transactions of the Association for Computational Linguistics}, + pages = {67-78}, + title = {From image descriptions to visual denotations: New similarity metrics for semantic inference over event descriptions}, + url = {https://api.semanticscholar.org/CorpusID:3104920}, + volume = {2}, + year = {2014}, +} +""", prompt={"query": "Find an image caption describing the following image."}, descriptive_stats={ "n_samples": {"test": 1000}, diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/Flickr30kT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/Flickr30kT2IRetrieval.py index cb87cfcf86..fdc0597bfb 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/Flickr30kT2IRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/Flickr30kT2IRetrieval.py @@ -26,15 +26,17 @@ class Flickr30kT2IRetrieval(AbsTaskAny2AnyRetrieval): dialect=[], modalities=["text", "image"], sample_creation="found", - bibtex_citation="""@article{Young2014FromID, - title={From image descriptions to visual denotations: New similarity metrics for semantic inference over event descriptions}, - author={Peter Young and Alice Lai and Micah Hodosh and J. Hockenmaier}, - journal={Transactions of the Association for Computational Linguistics}, - year={2014}, - volume={2}, - pages={67-78}, - url={https://api.semanticscholar.org/CorpusID:3104920} -}""", + bibtex_citation=r""" +@article{Young2014FromID, + author = {Peter Young and Alice Lai and Micah Hodosh and J. 
Hockenmaier}, + journal = {Transactions of the Association for Computational Linguistics}, + pages = {67-78}, + title = {From image descriptions to visual denotations: New similarity metrics for semantic inference over event descriptions}, + url = {https://api.semanticscholar.org/CorpusID:3104920}, + volume = {2}, + year = {2014}, +} +""", prompt={"query": "Find an image that matches the given caption."}, descriptive_stats={ "n_samples": {"test": 5000}, diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/GLDv2I2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/GLDv2I2IRetrieval.py index 1d0c2c3bcf..95cd709a36 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/GLDv2I2IRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/GLDv2I2IRetrieval.py @@ -26,14 +26,14 @@ class GLDv2I2IRetrieval(AbsTaskAny2AnyRetrieval): dialect=[], modalities=["image"], sample_creation="created", - bibtex_citation="""@InProceedings{Weyand_2020_CVPR, -author = {Weyand, Tobias and Araujo, Andre and Cao, Bingyi and Sim, Jack}, -title = {Google Landmarks Dataset v2 - A Large-Scale Benchmark for Instance-Level Recognition and Retrieval}, -booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, -month = {June}, -year = {2020} + bibtex_citation=r""" +@inproceedings{Weyand_2020_CVPR, + author = {Weyand, Tobias and Araujo, Andre and Cao, Bingyi and Sim, Jack}, + booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + month = {June}, + title = {Google Landmarks Dataset v2 - A Large-Scale Benchmark for Instance-Level Recognition and Retrieval}, + year = {2020}, } - """, descriptive_stats={ "n_samples": {"test": 1129}, diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/GLDv2I2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/GLDv2I2TRetrieval.py index db61790fb8..0a24ff2f14 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/GLDv2I2TRetrieval.py +++ 
b/mteb/tasks/Image/Any2AnyRetrieval/eng/GLDv2I2TRetrieval.py @@ -26,14 +26,14 @@ class GLDv2I2TRetrieval(AbsTaskAny2AnyRetrieval): dialect=[], modalities=["text", "image"], sample_creation="created", - bibtex_citation="""@InProceedings{Weyand_2020_CVPR, -author = {Weyand, Tobias and Araujo, Andre and Cao, Bingyi and Sim, Jack}, -title = {Google Landmarks Dataset v2 - A Large-Scale Benchmark for Instance-Level Recognition and Retrieval}, -booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, -month = {June}, -year = {2020} + bibtex_citation=r""" +@inproceedings{Weyand_2020_CVPR, + author = {Weyand, Tobias and Araujo, Andre and Cao, Bingyi and Sim, Jack}, + booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + month = {June}, + title = {Google Landmarks Dataset v2 - A Large-Scale Benchmark for Instance-Level Recognition and Retrieval}, + year = {2020}, } - """, descriptive_stats={ "n_samples": {"test": 1972}, diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/HatefulMemesI2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/HatefulMemesI2TRetrieval.py index bf7e273d73..4d9d2bf1a1 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/HatefulMemesI2TRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/HatefulMemesI2TRetrieval.py @@ -1,5 +1,6 @@ from __future__ import annotations +import polars as pl from datasets import concatenate_datasets, load_dataset from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval @@ -16,9 +17,13 @@ def _load_data(path: str, splits: str, cache_dir: str = None, revision: str = No cache_dir=cache_dir, revision=revision, ) - dataset_splits = list(dataset) + dataset_splits = ["test", "validation", "train"] shared_corpus = concatenate_datasets([dataset[split] for split in dataset_splits]) + text_df = pl.DataFrame({"text": shared_corpus["text"]}) + unique_indices = text_df["text"].arg_unique() + shared_corpus = 
shared_corpus.select(unique_indices) + shared_corpus = shared_corpus.map( lambda x: { "id": "corpus-" + str(x["id"]), @@ -61,12 +66,11 @@ def _load_data(path: str, splits: str, cache_dir: str = None, revision: str = No class HatefulMemesI2TRetrieval(AbsTaskAny2AnyRetrieval): metadata = TaskMetadata( name="HatefulMemesI2TRetrieval", - description="Retrieve captions based on memes.", + description="Retrieve captions based on memes to assess OCR abilities.", reference="https://arxiv.org/pdf/2005.04790", dataset={ "path": "Ahren09/MMSoc_HatefulMemes", "revision": "c9a9a6c3ef0765622a6de0af6ebb68f323ad73ba", - # "trust_remote_code": True, }, type="Any2AnyRetrieval", category="i2t", @@ -81,14 +85,16 @@ class HatefulMemesI2TRetrieval(AbsTaskAny2AnyRetrieval): dialect=[], modalities=["text", "image"], sample_creation="found", - bibtex_citation="""@article{kiela2020hateful, - title={The hateful memes challenge: Detecting hate speech in multimodal memes}, - author={Kiela, Douwe and Firooz, Hamed and Mohan, Aravind and Goswami, Vedanuj and Singh, Amanpreet and Ringshia, Pratik and Testuggine, Davide}, - journal={Advances in neural information processing systems}, - volume={33}, - pages={2611--2624}, - year={2020} -}""", + bibtex_citation=r""" +@article{kiela2020hateful, + author = {Kiela, Douwe and Firooz, Hamed and Mohan, Aravind and Goswami, Vedanuj and Singh, Amanpreet and Ringshia, Pratik and Testuggine, Davide}, + journal = {Advances in neural information processing systems}, + pages = {2611--2624}, + title = {The hateful memes challenge: Detecting hate speech in multimodal memes}, + volume = {33}, + year = {2020}, +} +""", descriptive_stats={ "n_samples": None, "avg_character_length": { diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/HatefulMemesT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/HatefulMemesT2IRetrieval.py index 89912a1213..4d403bb310 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/HatefulMemesT2IRetrieval.py +++ 
b/mteb/tasks/Image/Any2AnyRetrieval/eng/HatefulMemesT2IRetrieval.py @@ -1,5 +1,6 @@ from __future__ import annotations +import polars as pl from datasets import concatenate_datasets, load_dataset from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval @@ -16,9 +17,13 @@ def _load_data(path: str, splits: str, cache_dir: str = None, revision: str = No cache_dir=cache_dir, revision=revision, ) - dataset_splits = list(dataset) + dataset_splits = ["test", "validation", "train"] shared_corpus = concatenate_datasets([dataset[split] for split in dataset_splits]) + text_df = pl.DataFrame({"text": shared_corpus["text"]}) + unique_indices = text_df["text"].arg_unique() + shared_corpus = shared_corpus.select(unique_indices) + shared_corpus = shared_corpus.map( lambda x: { "id": "corpus-" + str(x["id"]), @@ -61,12 +66,11 @@ def _load_data(path: str, splits: str, cache_dir: str = None, revision: str = No class HatefulMemesT2IRetrieval(AbsTaskAny2AnyRetrieval): metadata = TaskMetadata( name="HatefulMemesT2IRetrieval", - description="Retrieve captions based on memes.", + description="Retrieve captions based on memes to assess OCR abilities.", reference="https://arxiv.org/pdf/2005.04790", dataset={ "path": "Ahren09/MMSoc_HatefulMemes", "revision": "c9a9a6c3ef0765622a6de0af6ebb68f323ad73ba", - # "trust_remote_code": True, }, type="Any2AnyRetrieval", category="t2i", @@ -81,14 +85,16 @@ class HatefulMemesT2IRetrieval(AbsTaskAny2AnyRetrieval): dialect=[], modalities=["text", "image"], sample_creation="found", - bibtex_citation="""@article{kiela2020hateful, - title={The hateful memes challenge: Detecting hate speech in multimodal memes}, - author={Kiela, Douwe and Firooz, Hamed and Mohan, Aravind and Goswami, Vedanuj and Singh, Amanpreet and Ringshia, Pratik and Testuggine, Davide}, - journal={Advances in neural information processing systems}, - volume={33}, - pages={2611--2624}, - year={2020} -}""", + bibtex_citation=r""" +@article{kiela2020hateful, + author = 
{Kiela, Douwe and Firooz, Hamed and Mohan, Aravind and Goswami, Vedanuj and Singh, Amanpreet and Ringshia, Pratik and Testuggine, Davide}, + journal = {Advances in neural information processing systems}, + pages = {2611--2624}, + title = {The hateful memes challenge: Detecting hate speech in multimodal memes}, + volume = {33}, + year = {2020}, +} +""", descriptive_stats={ "n_samples": None, "avg_character_length": { diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/ImageCoDeT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/ImageCoDeT2IRetrieval.py index 1b8472294c..8b420db51a 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/ImageCoDeT2IRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/ImageCoDeT2IRetrieval.py @@ -17,7 +17,7 @@ class ImageCoDeT2IRetrieval(AbsTaskAny2AnyRetrieval): category="t2i", eval_splits=["test"], eval_langs=["eng-Latn"], - main_score="ndcg_at_10", + main_score="cv_recall_at_3", date=("2022-05-22", "2022-05-27"), # conference dates domains=["Web", "Written"], task_subtypes=["Image Text Retrieval"], @@ -26,11 +26,12 @@ class ImageCoDeT2IRetrieval(AbsTaskAny2AnyRetrieval): dialect=[], modalities=["text", "image"], sample_creation="found", - bibtex_citation="""@article{krojer2022image, - title={Image retrieval from contextual descriptions}, - author={Krojer, Benno and Adlakha, Vaibhav and Vineet, Vibhav and Goyal, Yash and Ponti, Edoardo and Reddy, Siva}, - journal={arXiv preprint arXiv:2203.15867}, - year={2022} + bibtex_citation=r""" +@article{krojer2022image, + author = {Krojer, Benno and Adlakha, Vaibhav and Vineet, Vibhav and Goyal, Yash and Ponti, Edoardo and Reddy, Siva}, + journal = {arXiv preprint arXiv:2203.15867}, + title = {Image retrieval from contextual descriptions}, + year = {2022}, } """, descriptive_stats={ diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/InfoSeekIT2ITRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/InfoSeekIT2ITRetrieval.py index f695de1d19..ebbf3936a3 100644 --- 
a/mteb/tasks/Image/Any2AnyRetrieval/eng/InfoSeekIT2ITRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/InfoSeekIT2ITRetrieval.py @@ -27,13 +27,15 @@ class InfoSeekIT2ITRetrieval(AbsTaskAny2AnyRetrieval): dialect=[], modalities=["text", "image"], sample_creation="found", - bibtex_citation="""@inproceedings{chen2023can, - title={Can Pre-trained Vision and Language Models Answer Visual Information-Seeking Questions?}, - author={Chen, Yang and Hu, Hexiang and Luan, Yi and Sun, Haitian and Changpinyo, Soravit and Ritter, Alan and Chang, Ming-Wei}, - booktitle={Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing}, - pages={14948--14968}, - year={2023} -}""", + bibtex_citation=r""" +@inproceedings{chen2023can, + author = {Chen, Yang and Hu, Hexiang and Luan, Yi and Sun, Haitian and Changpinyo, Soravit and Ritter, Alan and Chang, Ming-Wei}, + booktitle = {Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing}, + pages = {14948--14968}, + title = {Can Pre-trained Vision and Language Models Answer Visual Information-Seeking Questions?}, + year = {2023}, +} +""", prompt={ "query": "Find an image and subject description from Wikipedia that answers my question about this image." 
}, diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/InfoSeekIT2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/InfoSeekIT2TRetrieval.py index e5cecd8591..bd11c5584c 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/InfoSeekIT2TRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/InfoSeekIT2TRetrieval.py @@ -27,13 +27,15 @@ class InfoSeekIT2TRetrieval(AbsTaskAny2AnyRetrieval): dialect=[], modalities=["text", "image"], sample_creation="found", - bibtex_citation="""@inproceedings{chen2023can, - title={Can Pre-trained Vision and Language Models Answer Visual Information-Seeking Questions?}, - author={Chen, Yang and Hu, Hexiang and Luan, Yi and Sun, Haitian and Changpinyo, Soravit and Ritter, Alan and Chang, Ming-Wei}, - booktitle={Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing}, - pages={14948--14968}, - year={2023} -}""", + bibtex_citation=r""" +@inproceedings{chen2023can, + author = {Chen, Yang and Hu, Hexiang and Luan, Yi and Sun, Haitian and Changpinyo, Soravit and Ritter, Alan and Chang, Ming-Wei}, + booktitle = {Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing}, + pages = {14948--14968}, + title = {Can Pre-trained Vision and Language Models Answer Visual Information-Seeking Questions?}, + year = {2023}, +} +""", prompt={ "query": "Find a paragraph from Wikipedia that answers my question about this image." 
}, diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/LLaVAIT2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/LLaVAIT2TRetrieval.py index 9a0ded2203..9147520531 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/LLaVAIT2TRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/LLaVAIT2TRetrieval.py @@ -26,24 +26,26 @@ class LLaVAIT2TRetrieval(AbsTaskAny2AnyRetrieval): dialect=[], modalities=["text", "image"], sample_creation="found", - bibtex_citation="""@inproceedings{lin-etal-2024-preflmr, - title = "{P}re{FLMR}: Scaling Up Fine-Grained Late-Interaction Multi-modal Retrievers", - author = "Lin, Weizhe and - Mei, Jingbiao and - Chen, Jinghong and - Byrne, Bill", - editor = "Ku, Lun-Wei and - Martins, Andre and - Srikumar, Vivek", - booktitle = "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)", - month = aug, - year = "2024", - address = "Bangkok, Thailand", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/2024.acl-long.289", - doi = "10.18653/v1/2024.acl-long.289", - pages = "5294--5316", -}""", + bibtex_citation=r""" +@inproceedings{lin-etal-2024-preflmr, + address = {Bangkok, Thailand}, + author = {Lin, Weizhe and +Mei, Jingbiao and +Chen, Jinghong and +Byrne, Bill}, + booktitle = {Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)}, + doi = {10.18653/v1/2024.acl-long.289}, + editor = {Ku, Lun-Wei and +Martins, Andre and +Srikumar, Vivek}, + month = aug, + pages = {5294--5316}, + publisher = {Association for Computational Linguistics}, + title = {{P}re{FLMR}: Scaling Up Fine-Grained Late-Interaction Multi-modal Retrievers}, + url = {https://aclanthology.org/2024.acl-long.289}, + year = {2024}, +} +""", prompt={ "query": "Provide a specific decription of the image along with the following question." 
}, diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/METI2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/METI2IRetrieval.py index 399c1fb792..30a77c0e7c 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/METI2IRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/METI2IRetrieval.py @@ -26,13 +26,14 @@ class METI2IRetrieval(AbsTaskAny2AnyRetrieval): dialect=[], modalities=["image"], sample_creation="created", - bibtex_citation="""@inproceedings{ypsilantis2021met, - title={The met dataset: Instance-level recognition for artworks}, - author={Ypsilantis, Nikolaos-Antonios and Garcia, Noa and Han, Guangxing and Ibrahimi, Sarah and Van Noord, Nanne and Tolias, Giorgos}, - booktitle={Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 2)}, - year={2021} + bibtex_citation=r""" +@inproceedings{ypsilantis2021met, + author = {Ypsilantis, Nikolaos-Antonios and Garcia, Noa and Han, Guangxing and Ibrahimi, Sarah and Van Noord, Nanne and Tolias, Giorgos}, + booktitle = {Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 2)}, + title = {The met dataset: Instance-level recognition for artworks}, + year = {2021}, } - """, +""", descriptive_stats={ # "n_samples": {"default": 397121}, }, diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/MSCOCOI2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/MSCOCOI2TRetrieval.py index bc4ce63c72..ca15edc3f1 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/MSCOCOI2TRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/MSCOCOI2TRetrieval.py @@ -27,14 +27,16 @@ class MSCOCOI2TRetrieval(AbsTaskAny2AnyRetrieval): dialect=[], modalities=["text", "image"], sample_creation="found", - bibtex_citation="""@inproceedings{lin2014microsoft, - title={Microsoft coco: Common objects in context}, - author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C 
Lawrence}, - booktitle={Computer Vision--ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V 13}, - pages={740--755}, - year={2014}, - organization={Springer} - }""", + bibtex_citation=r""" +@inproceedings{lin2014microsoft, + author = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle = {Computer Vision--ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V 13}, + organization = {Springer}, + pages = {740--755}, + title = {Microsoft coco: Common objects in context}, + year = {2014}, +} +""", prompt={ "query": "Find an image caption describing the following everyday image." }, diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/MSCOCOT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/MSCOCOT2IRetrieval.py index 4885e236c2..534f19d573 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/MSCOCOT2IRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/MSCOCOT2IRetrieval.py @@ -27,14 +27,16 @@ class MSCOCOT2IRetrieval(AbsTaskAny2AnyRetrieval): dialect=[], modalities=["text", "image"], sample_creation="found", - bibtex_citation="""@inproceedings{lin2014microsoft, - title={Microsoft coco: Common objects in context}, - author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, - booktitle={Computer Vision--ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V 13}, - pages={740--755}, - year={2014}, - organization={Springer} - }""", + bibtex_citation=r""" +@inproceedings{lin2014microsoft, + author = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle = {Computer Vision--ECCV 2014: 13th European 
Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V 13}, + organization = {Springer}, + pages = {740--755}, + title = {Microsoft coco: Common objects in context}, + year = {2014}, +} +""", prompt={"query": "Identify the image showcasing the described everyday scene."}, descriptive_stats={ "n_samples": {"test": 24809}, diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionI2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionI2TRetrieval.py index dfc42881df..7f93fb1fe8 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionI2TRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionI2TRetrieval.py @@ -21,15 +21,20 @@ def _load_data(path: str, splits: str, cache_dir: str = None, revision: str = No def map_function(split_name): return lambda x, idx: { "id": f"corpus-{split_name}-{idx}", - "text": x["text_corrected"] if x["text_corrected"] else "", + "text": x["text_corrected"], "modality": "text", "image": None, } - # Apply the map function to each split and concatenate + split_datasets = {} + for split in dataset_splits: + split_datasets[split] = dataset[split].filter( + lambda example: example["text_corrected"] is not None + ) + shared_corpus = concatenate_datasets( [ - dataset[split].map( + split_datasets[split].map( map_function(split), with_indices=True, remove_columns=[ @@ -46,13 +51,11 @@ def map_function(split_name): for split in dataset_splits ] ) - # image corrupted & caption empty - shared_corpus = shared_corpus.select( - [i for i in range(len(shared_corpus)) if i not in [4578, 6781, 6784, 6786]] - ) + for split in splits: corpus[split] = shared_corpus - split_dataset = dataset[split] + split_dataset = split_datasets[split] + queries[split] = split_dataset.map( lambda x, idx: { "id": f"query-{split}-{idx}", @@ -72,18 +75,14 @@ def map_function(split_name): "text_corrected", ], ) - if split == "test": - queries[split] = queries[split].select( - [i for i in range(len(queries[split])) if i not in [489, 492, 
494]] - ) + relevant_docs[split] = {} for index in range(len(split_dataset)): - if index not in [489, 492, 494]: - query_id = f"query-{split}-{index}" - doc_id = f"corpus-{split}-{index}" - if query_id not in relevant_docs[split]: - relevant_docs[split][query_id] = {} - relevant_docs[split][query_id][doc_id] = 1 + query_id = f"query-{split}-{index}" + doc_id = f"corpus-{split}-{index}" + if query_id not in relevant_docs[split]: + relevant_docs[split][query_id] = {} + relevant_docs[split][query_id][doc_id] = 1 return corpus, queries, relevant_docs @@ -110,13 +109,15 @@ class MemotionI2TRetrieval(AbsTaskAny2AnyRetrieval): dialect=[], modalities=["text", "image"], sample_creation="found", - bibtex_citation="""@inproceedings{sharma2020semeval, - title={SemEval-2020 Task 8: Memotion Analysis-the Visuo-Lingual Metaphor!}, - author={Sharma, Chhavi and Bhageria, Deepesh and Scott, William and Pykl, Srinivas and Das, Amitava and Chakraborty, Tanmoy and Pulabaigari, Viswanath and Gamb{\"a}ck, Bj{\"o}rn}, - booktitle={Proceedings of the Fourteenth Workshop on Semantic Evaluation}, - pages={759--773}, - year={2020} -}""", + bibtex_citation=r""" +@inproceedings{sharma2020semeval, + author = {Sharma, Chhavi and Bhageria, Deepesh and Scott, William and Pykl, Srinivas and Das, Amitava and Chakraborty, Tanmoy and Pulabaigari, Viswanath and Gamb{\"a}ck, Bj{\"o}rn}, + booktitle = {Proceedings of the Fourteenth Workshop on Semantic Evaluation}, + pages = {759--773}, + title = {SemEval-2020 Task 8: Memotion Analysis-the Visuo-Lingual Metaphor!}, + year = {2020}, +} +""", descriptive_stats={ "n_samples": None, "avg_character_length": { diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionT2IRetrieval.py index dff7746b5a..ebe784a78e 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionT2IRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/MemotionT2IRetrieval.py @@ -25,10 +25,15 @@ def 
map_function(split_name): "modality": "image", } - # Apply the map function to each split and concatenate + split_datasets = {} + for split in dataset_splits: + split_datasets[split] = dataset[split].filter( + lambda example: example["text_corrected"] is not None + ) + shared_corpus = concatenate_datasets( [ - dataset[split].map( + split_datasets[split].map( map_function(split), with_indices=True, remove_columns=[ @@ -45,13 +50,10 @@ def map_function(split_name): for split in dataset_splits ] ) - # image corrupted - shared_corpus = shared_corpus.select( - [i for i in range(len(shared_corpus)) if i not in [4578, 6781, 6784, 6786]] - ) + for split in splits: corpus[split] = shared_corpus - split_dataset = dataset[split] + split_dataset = split_datasets[split] queries[split] = split_dataset.map( lambda x, idx: { "id": f"query-{split}-{idx}", @@ -71,18 +73,14 @@ def map_function(split_name): "text_corrected", ], ) - if split == "test": - queries[split] = queries[split].select( - [i for i in range(len(queries[split])) if i not in [489, 492, 494]] - ) + relevant_docs[split] = {} for index in range(len(split_dataset)): - if index not in [489, 492, 494]: - query_id = f"query-{split}-{index}" - doc_id = f"corpus-{split}-{index}" - if query_id not in relevant_docs[split]: - relevant_docs[split][query_id] = {} - relevant_docs[split][query_id][doc_id] = 1 + query_id = f"query-{split}-{index}" + doc_id = f"corpus-{split}-{index}" + if query_id not in relevant_docs[split]: + relevant_docs[split][query_id] = {} + relevant_docs[split][query_id][doc_id] = 1 return corpus, queries, relevant_docs @@ -109,13 +107,15 @@ class MemotionT2IRetrieval(AbsTaskAny2AnyRetrieval): dialect=[], modalities=["text", "image"], sample_creation="found", - bibtex_citation="""@inproceedings{sharma2020semeval, - title={SemEval-2020 Task 8: Memotion Analysis-the Visuo-Lingual Metaphor!}, - author={Sharma, Chhavi and Bhageria, Deepesh and Scott, William and Pykl, Srinivas and Das, Amitava and Chakraborty, 
Tanmoy and Pulabaigari, Viswanath and Gamb{\"a}ck, Bj{\"o}rn}, - booktitle={Proceedings of the Fourteenth Workshop on Semantic Evaluation}, - pages={759--773}, - year={2020} -}""", + bibtex_citation=r""" +@inproceedings{sharma2020semeval, + author = {Sharma, Chhavi and Bhageria, Deepesh and Scott, William and Pykl, Srinivas and Das, Amitava and Chakraborty, Tanmoy and Pulabaigari, Viswanath and Gamb{\"a}ck, Bj{\"o}rn}, + booktitle = {Proceedings of the Fourteenth Workshop on Semantic Evaluation}, + pages = {759--773}, + title = {SemEval-2020 Task 8: Memotion Analysis-the Visuo-Lingual Metaphor!}, + year = {2020}, +} +""", descriptive_stats={ "n_samples": None, "avg_character_length": { diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/NIGHTSI2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/NIGHTSI2IRetrieval.py index aa05ac6494..90fcfc2a8d 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/NIGHTSI2IRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/NIGHTSI2IRetrieval.py @@ -26,13 +26,15 @@ class NIGHTSI2IRetrieval(AbsTaskAny2AnyRetrieval): dialect=[], modalities=["image"], sample_creation="created", - bibtex_citation="""@article{fu2024dreamsim, - title={DreamSim: Learning New Dimensions of Human Visual Similarity using Synthetic Data}, - author={Fu, Stephanie and Tamir, Netanel and Sundaram, Shobhita and Chai, Lucy and Zhang, Richard and Dekel, Tali and Isola, Phillip}, - journal={Advances in Neural Information Processing Systems}, - volume={36}, - year={2024} -}""", + bibtex_citation=r""" +@article{fu2024dreamsim, + author = {Fu, Stephanie and Tamir, Netanel and Sundaram, Shobhita and Chai, Lucy and Zhang, Richard and Dekel, Tali and Isola, Phillip}, + journal = {Advances in Neural Information Processing Systems}, + title = {DreamSim: Learning New Dimensions of Human Visual Similarity using Synthetic Data}, + volume = {36}, + year = {2024}, +} +""", prompt={ "query": "Find a day-to-day image that looks similar to the provided image." 
}, diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/OKVQAIT2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/OKVQAIT2TRetrieval.py index 65b1c3b202..69c0bd8372 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/OKVQAIT2TRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/OKVQAIT2TRetrieval.py @@ -26,13 +26,15 @@ class OKVQAIT2TRetrieval(AbsTaskAny2AnyRetrieval): dialect=[], modalities=["image", "text"], sample_creation="created", - bibtex_citation="""@inproceedings{marino2019ok, - title={Ok-vqa: A visual question answering benchmark requiring external knowledge}, - author={Marino, Kenneth and Rastegari, Mohammad and Farhadi, Ali and Mottaghi, Roozbeh}, - booktitle={Proceedings of the IEEE/cvf conference on computer vision and pattern recognition}, - pages={3195--3204}, - year={2019} -}""", + bibtex_citation=r""" +@inproceedings{marino2019ok, + author = {Marino, Kenneth and Rastegari, Mohammad and Farhadi, Ali and Mottaghi, Roozbeh}, + booktitle = {Proceedings of the IEEE/cvf conference on computer vision and pattern recognition}, + pages = {3195--3204}, + title = {Ok-vqa: A visual question answering benchmark requiring external knowledge}, + year = {2019}, +} +""", prompt={ "query": "Retrieve documents that provide an answer to the question alongside the image." 
}, diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/OVENIT2ITRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/OVENIT2ITRetrieval.py index c6d1ef6baa..60603a9ef6 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/OVENIT2ITRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/OVENIT2ITRetrieval.py @@ -26,13 +26,15 @@ class OVENIT2ITRetrieval(AbsTaskAny2AnyRetrieval): dialect=[], modalities=["image", "text"], sample_creation="created", - bibtex_citation="""@inproceedings{hu2023open, - title={Open-domain visual entity recognition: Towards recognizing millions of wikipedia entities}, - author={Hu, Hexiang and Luan, Yi and Chen, Yang and Khandelwal, Urvashi and Joshi, Mandar and Lee, Kenton and Toutanova, Kristina and Chang, Ming-Wei}, - booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision}, - pages={12065--12075}, - year={2023} -}""", + bibtex_citation=r""" +@inproceedings{hu2023open, + author = {Hu, Hexiang and Luan, Yi and Chen, Yang and Khandelwal, Urvashi and Joshi, Mandar and Lee, Kenton and Toutanova, Kristina and Chang, Ming-Wei}, + booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision}, + pages = {12065--12075}, + title = {Open-domain visual entity recognition: Towards recognizing millions of wikipedia entities}, + year = {2023}, +} +""", prompt={ "query": "Retrieve a Wikipedia image-description pair that provides evidence for the question of this image." 
}, diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/OVENIT2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/OVENIT2TRetrieval.py index 94898f4819..1c9ca29f24 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/OVENIT2TRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/OVENIT2TRetrieval.py @@ -26,13 +26,15 @@ class OVENIT2TRetrieval(AbsTaskAny2AnyRetrieval): dialect=[], modalities=["text"], sample_creation="created", - bibtex_citation="""@inproceedings{hu2023open, - title={Open-domain visual entity recognition: Towards recognizing millions of wikipedia entities}, - author={Hu, Hexiang and Luan, Yi and Chen, Yang and Khandelwal, Urvashi and Joshi, Mandar and Lee, Kenton and Toutanova, Kristina and Chang, Ming-Wei}, - booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision}, - pages={12065--12075}, - year={2023} -}""", + bibtex_citation=r""" +@inproceedings{hu2023open, + author = {Hu, Hexiang and Luan, Yi and Chen, Yang and Khandelwal, Urvashi and Joshi, Mandar and Lee, Kenton and Toutanova, Kristina and Chang, Ming-Wei}, + booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision}, + pages = {12065--12075}, + title = {Open-domain visual entity recognition: Towards recognizing millions of wikipedia entities}, + year = {2023}, +} +""", prompt={ "query": "Retrieve a Wikipedia paragraph that provides an answer to the given query about the image." 
}, diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/ROxfordI2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/ROxfordI2IRetrieval.py index c7583a45df..a20fa52c67 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/ROxfordI2IRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/ROxfordI2IRetrieval.py @@ -31,14 +31,15 @@ class ROxfordEasyI2IRetrieval(MultiChoiceEvaluationMixin, AbsTaskAny2AnyRetrieva dialect=[], modalities=["image"], sample_creation="created", - bibtex_citation="""@inproceedings{radenovic2018revisiting, - title={Revisiting oxford and paris: Large-scale image retrieval benchmarking}, - author={Radenovi{\'c}, Filip and Iscen, Ahmet and Tolias, Giorgos and Avrithis, Yannis and Chum, Ond{\v{r}}ej}, - booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, - pages={5706--5715}, - year={2018} + bibtex_citation=r""" +@inproceedings{radenovic2018revisiting, + author = {Radenovi{\'c}, Filip and Iscen, Ahmet and Tolias, Giorgos and Avrithis, Yannis and Chum, Ond{\v{r}}ej}, + booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages = {5706--5715}, + title = {Revisiting oxford and paris: Large-scale image retrieval benchmarking}, + year = {2018}, } - """, +""", descriptive_stats={ "n_samples": {"test": 5063}, "avg_character_length": { @@ -77,14 +78,15 @@ class ROxfordMediumI2IRetrieval(MultiChoiceEvaluationMixin, AbsTaskAny2AnyRetrie dialect=[], modalities=["image"], sample_creation="created", - bibtex_citation="""@inproceedings{radenovic2018revisiting, - title={Revisiting oxford and paris: Large-scale image retrieval benchmarking}, - author={Radenovi{\'c}, Filip and Iscen, Ahmet and Tolias, Giorgos and Avrithis, Yannis and Chum, Ond{\v{r}}ej}, - booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, - pages={5706--5715}, - year={2018} + bibtex_citation=r""" +@inproceedings{radenovic2018revisiting, + author = {Radenovi{\'c}, Filip and 
Iscen, Ahmet and Tolias, Giorgos and Avrithis, Yannis and Chum, Ond{\v{r}}ej}, + booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages = {5706--5715}, + title = {Revisiting oxford and paris: Large-scale image retrieval benchmarking}, + year = {2018}, } - """, +""", descriptive_stats={ "n_samples": {"test": 5063}, "avg_character_length": { @@ -123,14 +125,15 @@ class ROxfordHardI2IRetrieval(MultiChoiceEvaluationMixin, AbsTaskAny2AnyRetrieva dialect=[], modalities=["image"], sample_creation="created", - bibtex_citation="""@inproceedings{radenovic2018revisiting, - title={Revisiting oxford and paris: Large-scale image retrieval benchmarking}, - author={Radenovi{\'c}, Filip and Iscen, Ahmet and Tolias, Giorgos and Avrithis, Yannis and Chum, Ond{\v{r}}ej}, - booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, - pages={5706--5715}, - year={2018} + bibtex_citation=r""" +@inproceedings{radenovic2018revisiting, + author = {Radenovi{\'c}, Filip and Iscen, Ahmet and Tolias, Giorgos and Avrithis, Yannis and Chum, Ond{\v{r}}ej}, + booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages = {5706--5715}, + title = {Revisiting oxford and paris: Large-scale image retrieval benchmarking}, + year = {2018}, } - """, +""", descriptive_stats={ "n_samples": {"test": 5063}, "avg_character_length": { diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/RP2kI2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/RP2kI2IRetrieval.py index 5d08f1ef91..598bc295f6 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/RP2kI2IRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/RP2kI2IRetrieval.py @@ -26,13 +26,14 @@ class RP2kI2IRetrieval(AbsTaskAny2AnyRetrieval): dialect=[], modalities=["image"], sample_creation="created", - bibtex_citation="""@article{peng2020rp2k, - title={RP2K: A large-scale retail product dataset for fine-grained image classification}, - 
author={Peng, Jingtian and Xiao, Chang and Li, Yifan}, - journal={arXiv preprint arXiv:2006.12634}, - year={2020} + bibtex_citation=r""" +@article{peng2020rp2k, + author = {Peng, Jingtian and Xiao, Chang and Li, Yifan}, + journal = {arXiv preprint arXiv:2006.12634}, + title = {RP2K: A large-scale retail product dataset for fine-grained image classification}, + year = {2020}, } - """, +""", descriptive_stats={ "n_samples": {"test": 39457}, "avg_character_length": { diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/RParisI2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/RParisI2IRetrieval.py index 4cd698157b..d435961959 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/RParisI2IRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/RParisI2IRetrieval.py @@ -27,14 +27,15 @@ class RParisEasyI2IRetrieval(MultiChoiceEvaluationMixin, AbsTaskAny2AnyRetrieval dialect=[], modalities=["image"], sample_creation="created", - bibtex_citation="""@inproceedings{radenovic2018revisiting, - title={Revisiting oxford and paris: Large-scale image retrieval benchmarking}, - author={Radenovi{\'c}, Filip and Iscen, Ahmet and Tolias, Giorgos and Avrithis, Yannis and Chum, Ond{\v{r}}ej}, - booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, - pages={5706--5715}, - year={2018} + bibtex_citation=r""" +@inproceedings{radenovic2018revisiting, + author = {Radenovi{\'c}, Filip and Iscen, Ahmet and Tolias, Giorgos and Avrithis, Yannis and Chum, Ond{\v{r}}ej}, + booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages = {5706--5715}, + title = {Revisiting oxford and paris: Large-scale image retrieval benchmarking}, + year = {2018}, } - """, +""", descriptive_stats={ "n_samples": {"test": 6392}, "avg_character_length": { @@ -73,14 +74,15 @@ class RParisMediumI2IRetrieval(MultiChoiceEvaluationMixin, AbsTaskAny2AnyRetriev dialect=[], modalities=["image"], sample_creation="created", - 
bibtex_citation="""@inproceedings{radenovic2018revisiting, - title={Revisiting oxford and paris: Large-scale image retrieval benchmarking}, - author={Radenovi{\'c}, Filip and Iscen, Ahmet and Tolias, Giorgos and Avrithis, Yannis and Chum, Ond{\v{r}}ej}, - booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, - pages={5706--5715}, - year={2018} + bibtex_citation=r""" +@inproceedings{radenovic2018revisiting, + author = {Radenovi{\'c}, Filip and Iscen, Ahmet and Tolias, Giorgos and Avrithis, Yannis and Chum, Ond{\v{r}}ej}, + booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages = {5706--5715}, + title = {Revisiting oxford and paris: Large-scale image retrieval benchmarking}, + year = {2018}, } - """, +""", descriptive_stats={ "n_samples": {"test": 6392}, "avg_character_length": { @@ -119,14 +121,15 @@ class RParisHardI2IRetrieval(MultiChoiceEvaluationMixin, AbsTaskAny2AnyRetrieval dialect=[], modalities=["image"], sample_creation="created", - bibtex_citation="""@inproceedings{radenovic2018revisiting, - title={Revisiting oxford and paris: Large-scale image retrieval benchmarking}, - author={Radenovi{\'c}, Filip and Iscen, Ahmet and Tolias, Giorgos and Avrithis, Yannis and Chum, Ond{\v{r}}ej}, - booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, - pages={5706--5715}, - year={2018} + bibtex_citation=r""" +@inproceedings{radenovic2018revisiting, + author = {Radenovi{\'c}, Filip and Iscen, Ahmet and Tolias, Giorgos and Avrithis, Yannis and Chum, Ond{\v{r}}ej}, + booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages = {5706--5715}, + title = {Revisiting oxford and paris: Large-scale image retrieval benchmarking}, + year = {2018}, } - """, +""", descriptive_stats={ "n_samples": {"test": 6392}, "avg_character_length": { diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/ReMuQIT2TRetrieval.py 
b/mteb/tasks/Image/Any2AnyRetrieval/eng/ReMuQIT2TRetrieval.py index 648d2d2e44..205368a480 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/ReMuQIT2TRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/ReMuQIT2TRetrieval.py @@ -7,7 +7,7 @@ class ReMuQIT2TRetrieval(AbsTaskAny2AnyRetrieval): metadata = TaskMetadata( name="ReMuQIT2TRetrieval", - description="Retrieval a Wiki passage to answer query about an image.", + description="Retrieval of a Wiki passage to answer a query about an image.", reference="https://github.com/luomancs/ReMuQ", dataset={ "path": "izhx/UMRB-ReMuQ", @@ -26,25 +26,27 @@ class ReMuQIT2TRetrieval(AbsTaskAny2AnyRetrieval): dialect=[], modalities=["image", "text"], sample_creation="created", - bibtex_citation="""@inproceedings{luo-etal-2023-end, - title = "End-to-end Knowledge Retrieval with Multi-modal Queries", - author = "Luo, Man and - Fang, Zhiyuan and - Gokhale, Tejas and - Yang, Yezhou and - Baral, Chitta", - editor = "Rogers, Anna and - Boyd-Graber, Jordan and - Okazaki, Naoaki", - booktitle = "Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)", - month = jul, - year = "2023", - address = "Toronto, Canada", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/2023.acl-long.478", - doi = "10.18653/v1/2023.acl-long.478", - pages = "8573--8589", -}""", + bibtex_citation=r""" +@inproceedings{luo-etal-2023-end, + address = {Toronto, Canada}, + author = {Luo, Man and +Fang, Zhiyuan and +Gokhale, Tejas and +Yang, Yezhou and +Baral, Chitta}, + booktitle = {Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)}, + doi = {10.18653/v1/2023.acl-long.478}, + editor = {Rogers, Anna and +Boyd-Graber, Jordan and +Okazaki, Naoaki}, + month = jul, + pages = {8573--8589}, + publisher = {Association for Computational Linguistics}, + title = {End-to-end Knowledge Retrieval with Multi-modal 
Queries}, + url = {https://aclanthology.org/2023.acl-long.478}, + year = {2023}, +} +""", prompt={ "query": "Retrieve a fact-based paragraph that provides an answer to the given query about the image." }, diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/SOPI2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/SOPI2IRetrieval.py index 0558f0ce26..722522ed62 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/SOPI2IRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/SOPI2IRetrieval.py @@ -26,14 +26,15 @@ class SOPI2IRetrieval(AbsTaskAny2AnyRetrieval): dialect=[], modalities=["image"], sample_creation="created", - bibtex_citation="""@inproceedings{oh2016deep, - title={Deep metric learning via lifted structured feature embedding}, - author={Oh Song, Hyun and Xiang, Yu and Jegelka, Stefanie and Savarese, Silvio}, - booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, - pages={4004--4012}, - year={2016} + bibtex_citation=r""" +@inproceedings{oh2016deep, + author = {Oh Song, Hyun and Xiang, Yu and Jegelka, Stefanie and Savarese, Silvio}, + booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages = {4004--4012}, + title = {Deep metric learning via lifted structured feature embedding}, + year = {2016}, } - """, +""", descriptive_stats={ "n_samples": {"test": 120053}, "avg_character_length": { diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/SciMMIRI2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/SciMMIRI2TRetrieval.py index a8aac928c4..e2ba7e9742 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/SciMMIRI2TRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/SciMMIRI2TRetrieval.py @@ -86,12 +86,14 @@ class SciMMIRI2TRetrieval(AbsTaskAny2AnyRetrieval): dialect=[], modalities=["text", "image"], sample_creation="found", - bibtex_citation="""@article{wu2024scimmir, - title={SciMMIR: Benchmarking Scientific Multi-modal Information Retrieval}, - author={Wu, Siwei and Li, Yizhi 
and Zhu, Kang and Zhang, Ge and Liang, Yiming and Ma, Kaijing and Xiao, Chenghao and Zhang, Haoran and Yang, Bohao and Chen, Wenhu and others}, - journal={arXiv preprint arXiv:2401.13478}, - year={2024} -}""", + bibtex_citation=r""" +@article{wu2024scimmir, + author = {Wu, Siwei and Li, Yizhi and Zhu, Kang and Zhang, Ge and Liang, Yiming and Ma, Kaijing and Xiao, Chenghao and Zhang, Haoran and Yang, Bohao and Chen, Wenhu and others}, + journal = {arXiv preprint arXiv:2401.13478}, + title = {SciMMIR: Benchmarking Scientific Multi-modal Information Retrieval}, + year = {2024}, +} +""", descriptive_stats={ "n_samples": None, "avg_character_length": { diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/SciMMIRT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/SciMMIRT2IRetrieval.py index 41fa6aebc1..420db0d882 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/SciMMIRT2IRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/SciMMIRT2IRetrieval.py @@ -86,12 +86,14 @@ class SciMMIRT2IRetrieval(AbsTaskAny2AnyRetrieval): dialect=[], modalities=["text", "image"], sample_creation="found", - bibtex_citation="""@article{wu2024scimmir, - title={SciMMIR: Benchmarking Scientific Multi-modal Information Retrieval}, - author={Wu, Siwei and Li, Yizhi and Zhu, Kang and Zhang, Ge and Liang, Yiming and Ma, Kaijing and Xiao, Chenghao and Zhang, Haoran and Yang, Bohao and Chen, Wenhu and others}, - journal={arXiv preprint arXiv:2401.13478}, - year={2024} -}""", + bibtex_citation=r""" +@article{wu2024scimmir, + author = {Wu, Siwei and Li, Yizhi and Zhu, Kang and Zhang, Ge and Liang, Yiming and Ma, Kaijing and Xiao, Chenghao and Zhang, Haoran and Yang, Bohao and Chen, Wenhu and others}, + journal = {arXiv preprint arXiv:2401.13478}, + title = {SciMMIR: Benchmarking Scientific Multi-modal Information Retrieval}, + year = {2024}, +} +""", descriptive_stats={ "n_samples": None, "avg_character_length": { diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/SketchyI2IRetrieval.py 
b/mteb/tasks/Image/Any2AnyRetrieval/eng/SketchyI2IRetrieval.py index 002e1a39e2..5624b109e2 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/SketchyI2IRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/SketchyI2IRetrieval.py @@ -26,13 +26,14 @@ class SketchyI2IRetrieval(AbsTaskAny2AnyRetrieval): dialect=[], modalities=["image"], sample_creation="created", - bibtex_citation="""@inproceedings{ypsilantis2021met, - title={The met dataset: Instance-level recognition for artworks}, - author={Ypsilantis, Nikolaos-Antonios and Garcia, Noa and Han, Guangxing and Ibrahimi, Sarah and Van Noord, Nanne and Tolias, Giorgos}, - booktitle={Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 2)}, - year={2021} + bibtex_citation=r""" +@inproceedings{ypsilantis2021met, + author = {Ypsilantis, Nikolaos-Antonios and Garcia, Noa and Han, Guangxing and Ibrahimi, Sarah and Van Noord, Nanne and Tolias, Giorgos}, + booktitle = {Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 2)}, + title = {The met dataset: Instance-level recognition for artworks}, + year = {2021}, } - """, +""", descriptive_stats={ "n_samples": {"test": 452886}, "avg_character_length": { diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/StanfordCarsI2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/StanfordCarsI2IRetrieval.py index e8d267eeaa..e3c7fc0a3d 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/StanfordCarsI2IRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/StanfordCarsI2IRetrieval.py @@ -26,13 +26,14 @@ class StanfordCarsI2I(AbsTaskAny2AnyRetrieval): dialect=[], modalities=["image"], sample_creation="created", - bibtex_citation="""@inproceedings{Krause2013CollectingAL, - title={Collecting a Large-scale Dataset of Fine-grained Cars}, - author={Jonathan Krause and Jia Deng and Michael Stark and Li Fei-Fei}, - year={2013}, - url={https://api.semanticscholar.org/CorpusID:16632981} - } - """, 
+ bibtex_citation=r""" +@inproceedings{Krause2013CollectingAL, + author = {Jonathan Krause and Jia Deng and Michael Stark and Li Fei-Fei}, + title = {Collecting a Large-scale Dataset of Fine-grained Cars}, + url = {https://api.semanticscholar.org/CorpusID:16632981}, + year = {2013}, +} +""", descriptive_stats={ "n_samples": {"default": 8041}, "avg_character_length": { diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/TUBerlinT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/TUBerlinT2IRetrieval.py index b85cd1f94b..2d904f2a9a 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/TUBerlinT2IRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/TUBerlinT2IRetrieval.py @@ -27,16 +27,18 @@ class TUBerlinT2IRetrieval(AbsTaskAny2AnyRetrieval): dialect=[], modalities=["text", "image"], sample_creation="found", - bibtex_citation="""@article{eitz2012humans, - title={How do humans sketch objects?}, - author={Eitz, Mathias and Hays, James and Alexa, Marc}, - journal={ACM Transactions on graphics (TOG)}, - volume={31}, - number={4}, - pages={1--10}, - year={2012}, - publisher={Acm New York, NY, USA} -}""", + bibtex_citation=r""" +@article{eitz2012humans, + author = {Eitz, Mathias and Hays, James and Alexa, Marc}, + journal = {ACM Transactions on graphics (TOG)}, + number = {4}, + pages = {1--10}, + publisher = {Acm New York, NY, USA}, + title = {How do humans sketch objects?}, + volume = {31}, + year = {2012}, +} +""", descriptive_stats={ "n_samples": {"test": 250}, "avg_character_length": { diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/VQA2IT2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/VQA2IT2TRetrieval.py index 39f07cf945..04c94f7365 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/VQA2IT2TRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/VQA2IT2TRetrieval.py @@ -27,12 +27,13 @@ class VQA2IT2TRetrieval(AbsTaskAny2AnyRetrieval): dialect=[], modalities=["text", "image"], sample_creation="found", - 
bibtex_citation="""@InProceedings{Goyal_2017_CVPR, -author = {Goyal, Yash and Khot, Tejas and Summers-Stay, Douglas and Batra, Dhruv and Parikh, Devi}, -title = {Making the v in VQA Matter: Elevating the Role of Image Understanding in Visual Question Answering}, -booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, -month = {July}, -year = {2017} + bibtex_citation=r""" +@inproceedings{Goyal_2017_CVPR, + author = {Goyal, Yash and Khot, Tejas and Summers-Stay, Douglas and Batra, Dhruv and Parikh, Devi}, + booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + month = {July}, + title = {Making the v in VQA Matter: Elevating the Role of Image Understanding in Visual Question Answering}, + year = {2017}, } """, descriptive_stats={ diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/VidoreBenchRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/VidoreBenchRetrieval.py index f279d9b277..0031778a98 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/VidoreBenchRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/VidoreBenchRetrieval.py @@ -92,12 +92,14 @@ class VidoreArxivQARetrieval(AbsTaskAny2AnyRetrieval): dialect=[], modalities=["text", "image"], sample_creation="found", - bibtex_citation="""@article{faysse2024colpali, - title={ColPali: Efficient Document Retrieval with Vision Language Models}, - author={Faysse, Manuel and Sibille, Hugues and Wu, Tony and Viaud, Gautier and Hudelot, C{\'e}line and Colombo, Pierre}, - journal={arXiv preprint arXiv:2407.01449}, - year={2024} -}""", + bibtex_citation=r""" +@article{faysse2024colpali, + author = {Faysse, Manuel and Sibille, Hugues and Wu, Tony and Viaud, Gautier and Hudelot, C{\'e}line and Colombo, Pierre}, + journal = {arXiv preprint arXiv:2407.01449}, + title = {ColPali: Efficient Document Retrieval with Vision Language Models}, + year = {2024}, +} +""", prompt={"query": "Find a screenshot that relevant to the user's 
question."}, descriptive_stats={ "n_samples": None, @@ -146,12 +148,14 @@ class VidoreDocVQARetrieval(AbsTaskAny2AnyRetrieval): dialect=[], modalities=["text", "image"], sample_creation="found", - bibtex_citation="""@article{faysse2024colpali, - title={ColPali: Efficient Document Retrieval with Vision Language Models}, - author={Faysse, Manuel and Sibille, Hugues and Wu, Tony and Viaud, Gautier and Hudelot, C{\'e}line and Colombo, Pierre}, - journal={arXiv preprint arXiv:2407.01449}, - year={2024} -}""", + bibtex_citation=r""" +@article{faysse2024colpali, + author = {Faysse, Manuel and Sibille, Hugues and Wu, Tony and Viaud, Gautier and Hudelot, C{\'e}line and Colombo, Pierre}, + journal = {arXiv preprint arXiv:2407.01449}, + title = {ColPali: Efficient Document Retrieval with Vision Language Models}, + year = {2024}, +} +""", prompt={"query": "Find a screenshot that relevant to the user's question."}, descriptive_stats={ "n_samples": None, @@ -200,12 +204,14 @@ class VidoreInfoVQARetrieval(AbsTaskAny2AnyRetrieval): dialect=[], modalities=["text", "image"], sample_creation="found", - bibtex_citation="""@article{faysse2024colpali, - title={ColPali: Efficient Document Retrieval with Vision Language Models}, - author={Faysse, Manuel and Sibille, Hugues and Wu, Tony and Viaud, Gautier and Hudelot, C{\'e}line and Colombo, Pierre}, - journal={arXiv preprint arXiv:2407.01449}, - year={2024} -}""", + bibtex_citation=r""" +@article{faysse2024colpali, + author = {Faysse, Manuel and Sibille, Hugues and Wu, Tony and Viaud, Gautier and Hudelot, C{\'e}line and Colombo, Pierre}, + journal = {arXiv preprint arXiv:2407.01449}, + title = {ColPali: Efficient Document Retrieval with Vision Language Models}, + year = {2024}, +} +""", prompt={"query": "Find a screenshot that relevant to the user's question."}, descriptive_stats={ "n_samples": None, @@ -254,12 +260,14 @@ class VidoreTabfquadRetrieval(AbsTaskAny2AnyRetrieval): dialect=[], modalities=["text", "image"], 
sample_creation="found", - bibtex_citation="""@article{faysse2024colpali, - title={ColPali: Efficient Document Retrieval with Vision Language Models}, - author={Faysse, Manuel and Sibille, Hugues and Wu, Tony and Viaud, Gautier and Hudelot, C{\'e}line and Colombo, Pierre}, - journal={arXiv preprint arXiv:2407.01449}, - year={2024} -}""", + bibtex_citation=r""" +@article{faysse2024colpali, + author = {Faysse, Manuel and Sibille, Hugues and Wu, Tony and Viaud, Gautier and Hudelot, C{\'e}line and Colombo, Pierre}, + journal = {arXiv preprint arXiv:2407.01449}, + title = {ColPali: Efficient Document Retrieval with Vision Language Models}, + year = {2024}, +} +""", prompt={"query": "Find a screenshot that relevant to the user's question."}, descriptive_stats={ "n_samples": None, @@ -308,12 +316,14 @@ class VidoreTatdqaRetrieval(AbsTaskAny2AnyRetrieval): dialect=[], modalities=["text", "image"], sample_creation="found", - bibtex_citation="""@article{faysse2024colpali, - title={ColPali: Efficient Document Retrieval with Vision Language Models}, - author={Faysse, Manuel and Sibille, Hugues and Wu, Tony and Viaud, Gautier and Hudelot, C{\'e}line and Colombo, Pierre}, - journal={arXiv preprint arXiv:2407.01449}, - year={2024} -}""", + bibtex_citation=r""" +@article{faysse2024colpali, + author = {Faysse, Manuel and Sibille, Hugues and Wu, Tony and Viaud, Gautier and Hudelot, C{\'e}line and Colombo, Pierre}, + journal = {arXiv preprint arXiv:2407.01449}, + title = {ColPali: Efficient Document Retrieval with Vision Language Models}, + year = {2024}, +} +""", prompt={"query": "Find a screenshot that relevant to the user's question."}, descriptive_stats={ "n_samples": None, @@ -362,12 +372,14 @@ class VidoreShiftProjectRetrieval(AbsTaskAny2AnyRetrieval): dialect=[], modalities=["text", "image"], sample_creation="found", - bibtex_citation="""@article{faysse2024colpali, - title={ColPali: Efficient Document Retrieval with Vision Language Models}, - author={Faysse, Manuel and 
Sibille, Hugues and Wu, Tony and Viaud, Gautier and Hudelot, C{\'e}line and Colombo, Pierre}, - journal={arXiv preprint arXiv:2407.01449}, - year={2024} -}""", + bibtex_citation=r""" +@article{faysse2024colpali, + author = {Faysse, Manuel and Sibille, Hugues and Wu, Tony and Viaud, Gautier and Hudelot, C{\'e}line and Colombo, Pierre}, + journal = {arXiv preprint arXiv:2407.01449}, + title = {ColPali: Efficient Document Retrieval with Vision Language Models}, + year = {2024}, +} +""", prompt={"query": "Find a screenshot that relevant to the user's question."}, descriptive_stats={ "n_samples": None, @@ -416,12 +428,14 @@ class VidoreSyntheticDocQAAIRetrieval(AbsTaskAny2AnyRetrieval): dialect=[], modalities=["text", "image"], sample_creation="found", - bibtex_citation="""@article{faysse2024colpali, - title={ColPali: Efficient Document Retrieval with Vision Language Models}, - author={Faysse, Manuel and Sibille, Hugues and Wu, Tony and Viaud, Gautier and Hudelot, C{\'e}line and Colombo, Pierre}, - journal={arXiv preprint arXiv:2407.01449}, - year={2024} -}""", + bibtex_citation=r""" +@article{faysse2024colpali, + author = {Faysse, Manuel and Sibille, Hugues and Wu, Tony and Viaud, Gautier and Hudelot, C{\'e}line and Colombo, Pierre}, + journal = {arXiv preprint arXiv:2407.01449}, + title = {ColPali: Efficient Document Retrieval with Vision Language Models}, + year = {2024}, +} +""", prompt={"query": "Find a screenshot that relevant to the user's question."}, descriptive_stats={ "n_samples": None, @@ -470,12 +484,14 @@ class VidoreSyntheticDocQAEnergyRetrieval(AbsTaskAny2AnyRetrieval): dialect=[], modalities=["text", "image"], sample_creation="found", - bibtex_citation="""@article{faysse2024colpali, - title={ColPali: Efficient Document Retrieval with Vision Language Models}, - author={Faysse, Manuel and Sibille, Hugues and Wu, Tony and Viaud, Gautier and Hudelot, C{\'e}line and Colombo, Pierre}, - journal={arXiv preprint arXiv:2407.01449}, - year={2024} -}""", + 
bibtex_citation=r""" +@article{faysse2024colpali, + author = {Faysse, Manuel and Sibille, Hugues and Wu, Tony and Viaud, Gautier and Hudelot, C{\'e}line and Colombo, Pierre}, + journal = {arXiv preprint arXiv:2407.01449}, + title = {ColPali: Efficient Document Retrieval with Vision Language Models}, + year = {2024}, +} +""", prompt={"query": "Find a screenshot that relevant to the user's question."}, descriptive_stats={ "n_samples": None, @@ -524,12 +540,14 @@ class VidoreSyntheticDocQAGovernmentReportsRetrieval(AbsTaskAny2AnyRetrieval): dialect=[], modalities=["text", "image"], sample_creation="found", - bibtex_citation="""@article{faysse2024colpali, - title={ColPali: Efficient Document Retrieval with Vision Language Models}, - author={Faysse, Manuel and Sibille, Hugues and Wu, Tony and Viaud, Gautier and Hudelot, C{\'e}line and Colombo, Pierre}, - journal={arXiv preprint arXiv:2407.01449}, - year={2024} -}""", + bibtex_citation=r""" +@article{faysse2024colpali, + author = {Faysse, Manuel and Sibille, Hugues and Wu, Tony and Viaud, Gautier and Hudelot, C{\'e}line and Colombo, Pierre}, + journal = {arXiv preprint arXiv:2407.01449}, + title = {ColPali: Efficient Document Retrieval with Vision Language Models}, + year = {2024}, +} +""", prompt={"query": "Find a screenshot that relevant to the user's question."}, descriptive_stats={ "n_samples": None, @@ -578,12 +596,14 @@ class VidoreSyntheticDocQAHealthcareIndustryRetrieval(AbsTaskAny2AnyRetrieval): dialect=[], modalities=["text", "image"], sample_creation="found", - bibtex_citation="""@article{faysse2024colpali, - title={ColPali: Efficient Document Retrieval with Vision Language Models}, - author={Faysse, Manuel and Sibille, Hugues and Wu, Tony and Viaud, Gautier and Hudelot, C{\'e}line and Colombo, Pierre}, - journal={arXiv preprint arXiv:2407.01449}, - year={2024} -}""", + bibtex_citation=r""" +@article{faysse2024colpali, + author = {Faysse, Manuel and Sibille, Hugues and Wu, Tony and Viaud, Gautier and Hudelot, 
C{\'e}line and Colombo, Pierre}, + journal = {arXiv preprint arXiv:2407.01449}, + title = {ColPali: Efficient Document Retrieval with Vision Language Models}, + year = {2024}, +} +""", prompt={"query": "Find a screenshot that relevant to the user's question."}, descriptive_stats={ "n_samples": None, diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/VisualNewsI2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/VisualNewsI2TRetrieval.py index 2f79bfe9eb..5ed6d8635c 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/VisualNewsI2TRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/VisualNewsI2TRetrieval.py @@ -26,13 +26,15 @@ class VisualNewsI2TRetrieval(AbsTaskAny2AnyRetrieval): dialect=[], modalities=["image", "text"], sample_creation="created", - bibtex_citation="""@inproceedings{liu2021visual, - title={Visual News: Benchmark and Challenges in News Image Captioning}, - author={Liu, Fuxiao and Wang, Yinghan and Wang, Tianlu and Ordonez, Vicente}, - booktitle={Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing}, - pages={6761--6771}, - year={2021} -}""", + bibtex_citation=r""" +@inproceedings{liu2021visual, + author = {Liu, Fuxiao and Wang, Yinghan and Wang, Tianlu and Ordonez, Vicente}, + booktitle = {Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing}, + pages = {6761--6771}, + title = {Visual News: Benchmark and Challenges in News Image Captioning}, + year = {2021}, +} +""", prompt={"query": "Find a caption for the news in the given photo."}, descriptive_stats={ "n_samples": {"test": 20000}, diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/VisualNewsT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/VisualNewsT2IRetrieval.py index 1c5fa7fdbe..de8ddc4df5 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/VisualNewsT2IRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/VisualNewsT2IRetrieval.py @@ -26,13 +26,15 @@ class VisualNewsT2IRetrieval(AbsTaskAny2AnyRetrieval): 
dialect=[], modalities=["image", "text"], sample_creation="created", - bibtex_citation="""@inproceedings{liu2021visual, - title={Visual News: Benchmark and Challenges in News Image Captioning}, - author={Liu, Fuxiao and Wang, Yinghan and Wang, Tianlu and Ordonez, Vicente}, - booktitle={Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing}, - pages={6761--6771}, - year={2021} -}""", + bibtex_citation=r""" +@inproceedings{liu2021visual, + author = {Liu, Fuxiao and Wang, Yinghan and Wang, Tianlu and Ordonez, Vicente}, + booktitle = {Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing}, + pages = {6761--6771}, + title = {Visual News: Benchmark and Challenges in News Image Captioning}, + year = {2021}, +} +""", prompt={ "query": "Identify the news-related image in line with the described event." }, diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/VizWizIT2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/VizWizIT2TRetrieval.py index 96bcac96c3..a6970d157d 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/VizWizIT2TRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/VizWizIT2TRetrieval.py @@ -27,14 +27,14 @@ class VizWizIT2TRetrieval(AbsTaskAny2AnyRetrieval): dialect=[], modalities=["text", "image"], sample_creation="found", - bibtex_citation="""@inproceedings{gurari2018vizwiz, - title={Vizwiz grand challenge: Answering visual questions from blind people}, - author={Gurari, Danna and Li, Qing and Stangl, Abigale J and Guo, Anhong and Lin, Chi and Grauman, Kristen and Luo, Jiebo and Bigham, Jeffrey P}, - booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, - pages={3608--3617}, - year={2018} + bibtex_citation=r""" +@inproceedings{gurari2018vizwiz, + author = {Gurari, Danna and Li, Qing and Stangl, Abigale J and Guo, Anhong and Lin, Chi and Grauman, Kristen and Luo, Jiebo and Bigham, Jeffrey P}, + booktitle = {Proceedings of the IEEE conference on computer 
vision and pattern recognition}, + pages = {3608--3617}, + title = {Vizwiz grand challenge: Answering visual questions from blind people}, + year = {2018}, } - """, descriptive_stats={ "n_samples": {"test": 214354}, diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/WebQAT2ITRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/WebQAT2ITRetrieval.py index e3235c4912..307fe74259 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/WebQAT2ITRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/WebQAT2ITRetrieval.py @@ -26,13 +26,15 @@ class WebQAT2ITRetrieval(AbsTaskAny2AnyRetrieval): dialect=[], modalities=["image", "text"], sample_creation="created", - bibtex_citation="""@inproceedings{chang2022webqa, - title={Webqa: Multihop and multimodal qa}, - author={Chang, Yingshan and Narang, Mridu and Suzuki, Hisami and Cao, Guihong and Gao, Jianfeng and Bisk, Yonatan}, - booktitle={Proceedings of the IEEE/CVF conference on computer vision and pattern recognition}, - pages={16495--16504}, - year={2022} - }""", + bibtex_citation=r""" +@inproceedings{chang2022webqa, + author = {Chang, Yingshan and Narang, Mridu and Suzuki, Hisami and Cao, Guihong and Gao, Jianfeng and Bisk, Yonatan}, + booktitle = {Proceedings of the IEEE/CVF conference on computer vision and pattern recognition}, + pages = {16495--16504}, + title = {Webqa: Multihop and multimodal qa}, + year = {2022}, +} +""", prompt={"query": "Find a Wikipedia image that answers this question."}, descriptive_stats={ "n_samples": {"test": 2511}, diff --git a/mteb/tasks/Image/Any2AnyRetrieval/eng/WebQAT2TRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/eng/WebQAT2TRetrieval.py index 4583e61221..cfc760cba0 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/eng/WebQAT2TRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/eng/WebQAT2TRetrieval.py @@ -26,13 +26,15 @@ class WebQAT2TRetrieval(AbsTaskAny2AnyRetrieval): dialect=[], modalities=["text"], sample_creation="created", - bibtex_citation="""@inproceedings{chang2022webqa, - 
title={Webqa: Multihop and multimodal qa}, - author={Chang, Yingshan and Narang, Mridu and Suzuki, Hisami and Cao, Guihong and Gao, Jianfeng and Bisk, Yonatan}, - booktitle={Proceedings of the IEEE/CVF conference on computer vision and pattern recognition}, - pages={16495--16504}, - year={2022} - }""", + bibtex_citation=r""" +@inproceedings{chang2022webqa, + author = {Chang, Yingshan and Narang, Mridu and Suzuki, Hisami and Cao, Guihong and Gao, Jianfeng and Bisk, Yonatan}, + booktitle = {Proceedings of the IEEE/CVF conference on computer vision and pattern recognition}, + pages = {16495--16504}, + title = {Webqa: Multihop and multimodal qa}, + year = {2022}, +} +""", prompt={ "query": "Retrieve passages from Wikipedia that provide answers to the following question." }, diff --git a/mteb/tasks/Image/Any2AnyRetrieval/multilingual/VdrMultilingualRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/multilingual/VdrMultilingualRetrieval.py index 8ebc7c30b3..0355144949 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/multilingual/VdrMultilingualRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/multilingual/VdrMultilingualRetrieval.py @@ -123,12 +123,14 @@ class VDRMultilingualRetrieval(MultilingualTask, AbsTaskAny2AnyRetrieval): annotations_creators="LM-generated", dialect=[], sample_creation="found", - bibtex_citation="""@misc{llamaindex2024vdrmultilingual, - title={Visual Document Retrieval Goes Multilingual}, - author={LlamaIndex}, - year={2025}, - howpublished={https://huggingface.co/datasets/llamaindex/vdr-multilingual-test}, -}""", + bibtex_citation=r""" +@misc{llamaindex2024vdrmultilingual, + author = {LlamaIndex}, + howpublished = {https://huggingface.co/datasets/llamaindex/vdr-multilingual-test}, + title = {Visual Document Retrieval Goes Multilingual}, + year = {2025}, +} +""", ) def load_data(self, **kwargs): diff --git a/mteb/tasks/Image/Any2AnyRetrieval/multilingual/WITT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/multilingual/WITT2IRetrieval.py index 
884729d8ab..b67c2c0262 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/multilingual/WITT2IRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/multilingual/WITT2IRetrieval.py @@ -114,14 +114,16 @@ class WITT2IRetrieval(MultilingualTask, AbsTaskAny2AnyRetrieval): dialect=[], modalities=["text", "image"], sample_creation="found", - bibtex_citation="""@inproceedings{bugliarello2022iglue, - title={IGLUE: A benchmark for transfer learning across modalities, tasks, and languages}, - author={Bugliarello, Emanuele and Liu, Fangyu and Pfeiffer, Jonas and Reddy, Siva and Elliott, Desmond and Ponti, Edoardo Maria and Vuli{\'c}, Ivan}, - booktitle={International Conference on Machine Learning}, - pages={2370--2392}, - year={2022}, - organization={PMLR} -}""", + bibtex_citation=r""" +@inproceedings{bugliarello2022iglue, + author = {Bugliarello, Emanuele and Liu, Fangyu and Pfeiffer, Jonas and Reddy, Siva and Elliott, Desmond and Ponti, Edoardo Maria and Vuli{\'c}, Ivan}, + booktitle = {International Conference on Machine Learning}, + organization = {PMLR}, + pages = {2370--2392}, + title = {IGLUE: A benchmark for transfer learning across modalities, tasks, and languages}, + year = {2022}, +} +""", descriptive_stats={ "n_samples": None, "avg_character_length": { diff --git a/mteb/tasks/Image/Any2AnyRetrieval/multilingual/XFlickr30kCoT2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/multilingual/XFlickr30kCoT2IRetrieval.py index 4370c2752c..0e08df2b9d 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/multilingual/XFlickr30kCoT2IRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/multilingual/XFlickr30kCoT2IRetrieval.py @@ -97,14 +97,16 @@ class XFlickr30kCoT2IRetrieval(MultilingualTask, AbsTaskAny2AnyRetrieval): dialect=[], modalities=["text", "image"], sample_creation="found", - bibtex_citation="""@inproceedings{bugliarello2022iglue, - title={IGLUE: A benchmark for transfer learning across modalities, tasks, and languages}, - author={Bugliarello, Emanuele and Liu, Fangyu and 
Pfeiffer, Jonas and Reddy, Siva and Elliott, Desmond and Ponti, Edoardo Maria and Vuli{\'c}, Ivan}, - booktitle={International Conference on Machine Learning}, - pages={2370--2392}, - year={2022}, - organization={PMLR} -}""", + bibtex_citation=r""" +@inproceedings{bugliarello2022iglue, + author = {Bugliarello, Emanuele and Liu, Fangyu and Pfeiffer, Jonas and Reddy, Siva and Elliott, Desmond and Ponti, Edoardo Maria and Vuli{\'c}, Ivan}, + booktitle = {International Conference on Machine Learning}, + organization = {PMLR}, + pages = {2370--2392}, + title = {IGLUE: A benchmark for transfer learning across modalities, tasks, and languages}, + year = {2022}, +} +""", descriptive_stats={ "n_samples": None, "avg_character_length": { diff --git a/mteb/tasks/Image/Any2AnyRetrieval/multilingual/XM3600T2IRetrieval.py b/mteb/tasks/Image/Any2AnyRetrieval/multilingual/XM3600T2IRetrieval.py index 880c7aade8..17136c8e3a 100644 --- a/mteb/tasks/Image/Any2AnyRetrieval/multilingual/XM3600T2IRetrieval.py +++ b/mteb/tasks/Image/Any2AnyRetrieval/multilingual/XM3600T2IRetrieval.py @@ -142,13 +142,15 @@ class XM3600T2IRetrieval(MultilingualTask, AbsTaskAny2AnyRetrieval): dialect=[], modalities=["text", "image"], sample_creation="found", - bibtex_citation="""@inproceedings{thapliyal2022crossmodal, - title={Crossmodal-3600: A Massively Multilingual Multimodal Evaluation Dataset}, - author={Thapliyal, Ashish V and Tuset, Jordi Pont and Chen, Xi and Soricut, Radu}, - booktitle={Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing}, - pages={715--729}, - year={2022} -}""", + bibtex_citation=r""" +@inproceedings{thapliyal2022crossmodal, + author = {Thapliyal, Ashish V and Tuset, Jordi Pont and Chen, Xi and Soricut, Radu}, + booktitle = {Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing}, + pages = {715--729}, + title = {Crossmodal-3600: A Massively Multilingual Multimodal Evaluation Dataset}, + year = {2022}, +} +""", 
descriptive_stats={ "n_samples": None, "avg_character_length": { diff --git a/mteb/tasks/Image/ImageClassification/eng/BirdsnapClassification.py b/mteb/tasks/Image/ImageClassification/eng/BirdsnapClassification.py index 2e11094b09..ed1d732bed 100644 --- a/mteb/tasks/Image/ImageClassification/eng/BirdsnapClassification.py +++ b/mteb/tasks/Image/ImageClassification/eng/BirdsnapClassification.py @@ -29,14 +29,15 @@ class BirdsnapClassification(AbsTaskImageClassification): dialect=[], modalities=["image"], sample_creation="created", - bibtex_citation="""@InProceedings{Berg_2014_CVPR, - author = {Berg, Thomas and Liu, Jiongxin and Woo Lee, Seung and Alexander, Michelle L. and Jacobs, David W. and Belhumeur, Peter N.}, - title = {Birdsnap: Large-scale Fine-grained Visual Categorization of Birds}, - booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, - month = {June}, - year = {2014} - } - """, + bibtex_citation=r""" +@inproceedings{Berg_2014_CVPR, + author = {Berg, Thomas and Liu, Jiongxin and Woo Lee, Seung and Alexander, Michelle L. and Jacobs, David W. 
and Belhumeur, Peter N.}, + booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + month = {June}, + title = {Birdsnap: Large-scale Fine-grained Visual Categorization of Birds}, + year = {2014}, +} +""", descriptive_stats={ "n_samples": {"test": 1851}, "avg_character_length": {"test": 431.4}, diff --git a/mteb/tasks/Image/ImageClassification/eng/CIFAR.py b/mteb/tasks/Image/ImageClassification/eng/CIFAR.py index abed2ad617..7c02ae79ce 100644 --- a/mteb/tasks/Image/ImageClassification/eng/CIFAR.py +++ b/mteb/tasks/Image/ImageClassification/eng/CIFAR.py @@ -29,13 +29,14 @@ class CIFAR10Classification(AbsTaskImageClassification): dialect=[], modalities=["image"], sample_creation="created", - bibtex_citation=""" @TECHREPORT{Krizhevsky09learningmultiple, - author = {Alex Krizhevsky}, - title = {Learning multiple layers of features from tiny images}, - institution = {}, - year = {2009} - } - """, + bibtex_citation=r""" +@techreport{Krizhevsky09learningmultiple, + author = {Alex Krizhevsky}, + institution = {}, + title = {Learning multiple layers of features from tiny images}, + year = {2009}, +} +""", descriptive_stats={ "n_samples": {"test": 10000}, "avg_character_length": {"test": 431.4}, @@ -69,13 +70,14 @@ class CIFAR100Classification(AbsTaskImageClassification): dialect=[], modalities=["image"], sample_creation="created", - bibtex_citation=""" @TECHREPORT{Krizhevsky09learningmultiple, - author = {Alex Krizhevsky}, - title = {Learning multiple layers of features from tiny images}, - institution = {}, - year = {2009} - } - """, + bibtex_citation=r""" +@techreport{Krizhevsky09learningmultiple, + author = {Alex Krizhevsky}, + institution = {}, + title = {Learning multiple layers of features from tiny images}, + year = {2009}, +} +""", descriptive_stats={ "n_samples": {"test": 10000}, "avg_character_length": {"test": 431.4}, diff --git a/mteb/tasks/Image/ImageClassification/eng/Caltech101Classification.py 
b/mteb/tasks/Image/ImageClassification/eng/Caltech101Classification.py index 1a696b7b48..112df6e793 100644 --- a/mteb/tasks/Image/ImageClassification/eng/Caltech101Classification.py +++ b/mteb/tasks/Image/ImageClassification/eng/Caltech101Classification.py @@ -31,17 +31,19 @@ class Caltech101Classification(AbsTaskImageClassification): dialect=[], modalities=["image"], sample_creation="created", - bibtex_citation="""@INPROCEEDINGS{1384978, - author={Li Fei-Fei and Fergus, R. and Perona, P.}, - booktitle={2004 Conference on Computer Vision and Pattern Recognition Workshop}, - title={Learning Generative Visual Models from Few Training Examples: An Incremental Bayesian Approach Tested on 101 Object Categories}, - year={2004}, - volume={}, - number={}, - pages={178-178}, - keywords={Bayesian methods;Testing;Humans;Maximum likelihood estimation;Assembly;Shape;Machine vision;Image recognition;Parameter estimation;Image databases}, - doi={10.1109/CVPR.2004.383}} - """, + bibtex_citation=r""" +@inproceedings{1384978, + author = {Li Fei-Fei and Fergus, R. 
and Perona, P.}, + booktitle = {2004 Conference on Computer Vision and Pattern Recognition Workshop}, + doi = {10.1109/CVPR.2004.383}, + keywords = {Bayesian methods;Testing;Humans;Maximum likelihood estimation;Assembly;Shape;Machine vision;Image recognition;Parameter estimation;Image databases}, + number = {}, + pages = {178-178}, + title = {Learning Generative Visual Models from Few Training Examples: An Incremental Bayesian Approach Tested on 101 Object Categories}, + volume = {}, + year = {2004}, +} +""", descriptive_stats={ "n_samples": {"test": 6084}, "avg_character_length": {"test": 431.4}, diff --git a/mteb/tasks/Image/ImageClassification/eng/Country211Classification.py b/mteb/tasks/Image/ImageClassification/eng/Country211Classification.py index b73f895595..1ea60abd63 100644 --- a/mteb/tasks/Image/ImageClassification/eng/Country211Classification.py +++ b/mteb/tasks/Image/ImageClassification/eng/Country211Classification.py @@ -29,12 +29,14 @@ class Country211Classification(AbsTaskImageClassification): dialect=[], modalities=["image"], sample_creation="created", - bibtex_citation="""@article{radford2021learning, - title={Learning Transferable Visual Models From Natural Language Supervision}, - author={Radford, Alec and Kim, Jong Wook and Hallacy, Chris and Ramesh, Aditya and Goh, Gabriel and Agarwal, Sandhini and Sastry, Girish and Askell, Amanda and Mishkin, Pamela and Clark, Jack and others}, - journal={arXiv preprint arXiv:2103.00020}, - year={2021} - }""", + bibtex_citation=r""" +@article{radford2021learning, + author = {Radford, Alec and Kim, Jong Wook and Hallacy, Chris and Ramesh, Aditya and Goh, Gabriel and Agarwal, Sandhini and Sastry, Girish and Askell, Amanda and Mishkin, Pamela and Clark, Jack and others}, + journal = {arXiv preprint arXiv:2103.00020}, + title = {Learning Transferable Visual Models From Natural Language Supervision}, + year = {2021}, +} +""", descriptive_stats={ "n_samples": {"test": 21100}, "avg_character_length": {"test": 0}, 
diff --git a/mteb/tasks/Image/ImageClassification/eng/DTDClassification.py b/mteb/tasks/Image/ImageClassification/eng/DTDClassification.py index eb7360f088..ec6dfc5708 100644 --- a/mteb/tasks/Image/ImageClassification/eng/DTDClassification.py +++ b/mteb/tasks/Image/ImageClassification/eng/DTDClassification.py @@ -29,12 +29,14 @@ class DTDClassification(AbsTaskImageClassification): dialect=[], modalities=["image"], sample_creation="created", - bibtex_citation="""@InProceedings{cimpoi14describing, - Author = {M. Cimpoi and S. Maji and I. Kokkinos and S. Mohamed and and A. Vedaldi}, - Title = {Describing Textures in the Wild}, - Booktitle = {Proceedings of the {IEEE} Conf. on Computer Vision and Pattern Recognition ({CVPR})}, - Year = {2014}} - """, + bibtex_citation=r""" +@inproceedings{cimpoi14describing, + author = {M. Cimpoi and S. Maji and I. Kokkinos and S. Mohamed and and A. Vedaldi}, + booktitle = {Proceedings of the {IEEE} Conf. on Computer Vision and Pattern Recognition ({CVPR})}, + title = {Describing Textures in the Wild}, + year = {2014}, +} +""", descriptive_stats={ "n_samples": {"test": 1880}, "avg_character_length": {"test": 456}, diff --git a/mteb/tasks/Image/ImageClassification/eng/EuroSATClassification.py b/mteb/tasks/Image/ImageClassification/eng/EuroSATClassification.py index 7db9f482dd..ed3d092c98 100644 --- a/mteb/tasks/Image/ImageClassification/eng/EuroSATClassification.py +++ b/mteb/tasks/Image/ImageClassification/eng/EuroSATClassification.py @@ -29,17 +29,19 @@ class EuroSATClassification(AbsTaskImageClassification): dialect=[], modalities=["image"], sample_creation="created", - bibtex_citation="""@ARTICLE{8736785, - author={Helber, Patrick and Bischke, Benjamin and Dengel, Andreas and Borth, Damian}, - journal={IEEE Journal of Selected Topics in Applied Earth Observations and Remote Sensing}, - title={EuroSAT: A Novel Dataset and Deep Learning Benchmark for Land Use and Land Cover Classification}, - year={2019}, - volume={12}, - number={7}, 
- pages={2217-2226}, - keywords={Satellites;Earth;Remote sensing;Machine learning;Spatial resolution;Feature extraction;Benchmark testing;Dataset;deep convolutional neural network;deep learning;earth observation;land cover classification;land use classification;machine learning;remote sensing;satellite image classification;satellite images}, - doi={10.1109/JSTARS.2019.2918242}} - """, + bibtex_citation=r""" +@article{8736785, + author = {Helber, Patrick and Bischke, Benjamin and Dengel, Andreas and Borth, Damian}, + doi = {10.1109/JSTARS.2019.2918242}, + journal = {IEEE Journal of Selected Topics in Applied Earth Observations and Remote Sensing}, + keywords = {Satellites;Earth;Remote sensing;Machine learning;Spatial resolution;Feature extraction;Benchmark testing;Dataset;deep convolutional neural network;deep learning;earth observation;land cover classification;land use classification;machine learning;remote sensing;satellite image classification;satellite images}, + number = {7}, + pages = {2217-2226}, + title = {EuroSAT: A Novel Dataset and Deep Learning Benchmark for Land Use and Land Cover Classification}, + volume = {12}, + year = {2019}, +} +""", descriptive_stats={ "n_samples": {"test": 5400}, "avg_character_length": {"test": 431.4}, diff --git a/mteb/tasks/Image/ImageClassification/eng/FER2013Classification.py b/mteb/tasks/Image/ImageClassification/eng/FER2013Classification.py index 7f55d7a665..2f32ae8629 100644 --- a/mteb/tasks/Image/ImageClassification/eng/FER2013Classification.py +++ b/mteb/tasks/Image/ImageClassification/eng/FER2013Classification.py @@ -29,16 +29,17 @@ class FER2013Classification(AbsTaskImageClassification): dialect=[], modalities=["image"], sample_creation="created", - bibtex_citation="""@misc{goodfellow2015explainingharnessingadversarialexamples, - title={Explaining and Harnessing Adversarial Examples}, - author={Ian J. 
Goodfellow and Jonathon Shlens and Christian Szegedy}, - year={2015}, - eprint={1412.6572}, - archivePrefix={arXiv}, - primaryClass={stat.ML}, - url={https://arxiv.org/abs/1412.6572}, - } - """, + bibtex_citation=r""" +@misc{goodfellow2015explainingharnessingadversarialexamples, + archiveprefix = {arXiv}, + author = {Ian J. Goodfellow and Jonathon Shlens and Christian Szegedy}, + eprint = {1412.6572}, + primaryclass = {stat.ML}, + title = {Explaining and Harnessing Adversarial Examples}, + url = {https://arxiv.org/abs/1412.6572}, + year = {2015}, +} +""", descriptive_stats={ "n_samples": {"test": 7178}, "avg_character_length": {"test": 431.4}, diff --git a/mteb/tasks/Image/ImageClassification/eng/FGVCAircraftClassification.py b/mteb/tasks/Image/ImageClassification/eng/FGVCAircraftClassification.py index 21f14a300e..96d525a95e 100644 --- a/mteb/tasks/Image/ImageClassification/eng/FGVCAircraftClassification.py +++ b/mteb/tasks/Image/ImageClassification/eng/FGVCAircraftClassification.py @@ -30,16 +30,17 @@ class FGVCAircraftClassification(AbsTaskImageClassification): dialect=[], modalities=["image"], sample_creation="created", - bibtex_citation="""@misc{maji2013finegrainedvisualclassificationaircraft, - title={Fine-Grained Visual Classification of Aircraft}, - author={Subhransu Maji and Esa Rahtu and Juho Kannala and Matthew Blaschko and Andrea Vedaldi}, - year={2013}, - eprint={1306.5151}, - archivePrefix={arXiv}, - primaryClass={cs.CV}, - url={https://arxiv.org/abs/1306.5151}, - } - """, + bibtex_citation=r""" +@misc{maji2013finegrainedvisualclassificationaircraft, + archiveprefix = {arXiv}, + author = {Subhransu Maji and Esa Rahtu and Juho Kannala and Matthew Blaschko and Andrea Vedaldi}, + eprint = {1306.5151}, + primaryclass = {cs.CV}, + title = {Fine-Grained Visual Classification of Aircraft}, + url = {https://arxiv.org/abs/1306.5151}, + year = {2013}, +} +""", descriptive_stats={ "n_samples": {"test": 3333}, "avg_character_length": {"test": 431.4}, diff --git 
a/mteb/tasks/Image/ImageClassification/eng/Food101Classification.py b/mteb/tasks/Image/ImageClassification/eng/Food101Classification.py index 1bbe8e106b..a53b064ec6 100644 --- a/mteb/tasks/Image/ImageClassification/eng/Food101Classification.py +++ b/mteb/tasks/Image/ImageClassification/eng/Food101Classification.py @@ -29,13 +29,14 @@ class Food101Classification(AbsTaskImageClassification): dialect=[], modalities=["image"], sample_creation="created", - bibtex_citation=""" @inproceedings{bossard14, - title = {Food-101 -- Mining Discriminative Components with Random Forests}, - author = {Bossard, Lukas and Guillaumin, Matthieu and Van Gool, Luc}, - booktitle = {European Conference on Computer Vision}, - year = {2014} - } - """, + bibtex_citation=r""" +@inproceedings{bossard14, + author = {Bossard, Lukas and Guillaumin, Matthieu and Van Gool, Luc}, + booktitle = {European Conference on Computer Vision}, + title = {Food-101 -- Mining Discriminative Components with Random Forests}, + year = {2014}, +} +""", descriptive_stats={ "n_samples": {"validation": 25300}, "avg_character_length": {"validation": 431.4}, diff --git a/mteb/tasks/Image/ImageClassification/eng/GTSRBClassification.py b/mteb/tasks/Image/ImageClassification/eng/GTSRBClassification.py index a6feffede4..528f50f7b1 100644 --- a/mteb/tasks/Image/ImageClassification/eng/GTSRBClassification.py +++ b/mteb/tasks/Image/ImageClassification/eng/GTSRBClassification.py @@ -29,16 +29,18 @@ class GTSRBClassification(AbsTaskImageClassification): dialect=[], modalities=["image"], sample_creation="created", - bibtex_citation="""@INPROCEEDINGS{6033395, - author={Stallkamp, Johannes and Schlipsing, Marc and Salmen, Jan and Igel, Christian}, - booktitle={The 2011 International Joint Conference on Neural Networks}, - title={The German Traffic Sign Recognition Benchmark: A multi-class classification competition}, - year={2011}, - volume={}, - number={}, - pages={1453-1460}, - keywords={Humans;Training;Image color 
analysis;Benchmark testing;Lead;Histograms;Image resolution}, - doi={10.1109/IJCNN.2011.6033395}} + bibtex_citation=r""" +@inproceedings{6033395, + author = {Stallkamp, Johannes and Schlipsing, Marc and Salmen, Jan and Igel, Christian}, + booktitle = {The 2011 International Joint Conference on Neural Networks}, + doi = {10.1109/IJCNN.2011.6033395}, + keywords = {Humans;Training;Image color analysis;Benchmark testing;Lead;Histograms;Image resolution}, + number = {}, + pages = {1453-1460}, + title = {The German Traffic Sign Recognition Benchmark: A multi-class classification competition}, + volume = {}, + year = {2011}, +} """, descriptive_stats={ "n_samples": {"test": 12630}, diff --git a/mteb/tasks/Image/ImageClassification/eng/Imagenet1k.py b/mteb/tasks/Image/ImageClassification/eng/Imagenet1k.py index bed879d282..0a9155cc8f 100644 --- a/mteb/tasks/Image/ImageClassification/eng/Imagenet1k.py +++ b/mteb/tasks/Image/ImageClassification/eng/Imagenet1k.py @@ -29,14 +29,16 @@ class Imagenet1kClassification(AbsTaskImageClassification): dialect=[], modalities=["image"], sample_creation="created", - bibtex_citation="""@article{deng2009imagenet, - title={ImageNet: A large-scale hierarchical image database}, - author={Deng, Jia and Dong, Wei and Socher, Richard and Li, Li-Jia and Li, Kai and Fei-Fei, Li}, - journal={2009 IEEE Conference on Computer Vision and Pattern Recognition}, - pages={248--255}, - year={2009}, - organization={Ieee} - }""", + bibtex_citation=r""" +@article{deng2009imagenet, + author = {Deng, Jia and Dong, Wei and Socher, Richard and Li, Li-Jia and Li, Kai and Fei-Fei, Li}, + journal = {2009 IEEE Conference on Computer Vision and Pattern Recognition}, + organization = {Ieee}, + pages = {248--255}, + title = {ImageNet: A large-scale hierarchical image database}, + year = {2009}, +} +""", descriptive_stats={ "n_samples": {"test": 37200}, "avg_character_length": {"test": 0}, diff --git a/mteb/tasks/Image/ImageClassification/eng/MNISTClassification.py 
b/mteb/tasks/Image/ImageClassification/eng/MNISTClassification.py index 8230938a14..3a86d8b999 100644 --- a/mteb/tasks/Image/ImageClassification/eng/MNISTClassification.py +++ b/mteb/tasks/Image/ImageClassification/eng/MNISTClassification.py @@ -29,14 +29,15 @@ class MNISTClassification(AbsTaskImageClassification): dialect=[], modalities=["image"], sample_creation="created", - bibtex_citation="""@article{lecun2010mnist, - title={MNIST handwritten digit database}, - author={LeCun, Yann and Cortes, Corinna and Burges, CJ}, - journal={ATT Labs [Online]. Available: http://yann.lecun.com/exdb/mnist}, - volume={2}, - year={2010} - } - """, + bibtex_citation=r""" +@article{lecun2010mnist, + author = {LeCun, Yann and Cortes, Corinna and Burges, CJ}, + journal = {ATT Labs [Online]. Available: http://yann.lecun.com/exdb/mnist}, + title = {MNIST handwritten digit database}, + volume = {2}, + year = {2010}, +} +""", descriptive_stats={ "n_samples": {"test": 10000}, "avg_character_length": {"test": 431.4}, diff --git a/mteb/tasks/Image/ImageClassification/eng/OxfordFlowersClassification.py b/mteb/tasks/Image/ImageClassification/eng/OxfordFlowersClassification.py index 9411f88baf..d9453b8308 100644 --- a/mteb/tasks/Image/ImageClassification/eng/OxfordFlowersClassification.py +++ b/mteb/tasks/Image/ImageClassification/eng/OxfordFlowersClassification.py @@ -29,16 +29,19 @@ class OxfordFlowersClassification(AbsTaskImageClassification): dialect=[], modalities=["image"], sample_creation="found", - bibtex_citation="""@INPROCEEDINGS{4756141, - author={Nilsback, Maria-Elena and Zisserman, Andrew}, - booktitle={2008 Sixth Indian Conference on Computer Vision, Graphics & Image Processing}, - title={Automated Flower Classification over a Large Number of Classes}, - year={2008}, - volume={}, - number={}, - pages={722-729}, - keywords={Shape;Kernel;Distributed computing;Support vector machines;Support vector machine classification;object classification;segmentation}, - 
doi={10.1109/ICVGIP.2008.47}}""", + bibtex_citation=r""" +@inproceedings{4756141, + author = {Nilsback, Maria-Elena and Zisserman, Andrew}, + booktitle = {2008 Sixth Indian Conference on Computer Vision, Graphics & Image Processing}, + doi = {10.1109/ICVGIP.2008.47}, + keywords = {Shape;Kernel;Distributed computing;Support vector machines;Support vector machine classification;object classification;segmentation}, + number = {}, + pages = {722-729}, + title = {Automated Flower Classification over a Large Number of Classes}, + volume = {}, + year = {2008}, +} +""", descriptive_stats={ "n_samples": {"test": 400000}, "avg_character_length": {"test": 431.4}, diff --git a/mteb/tasks/Image/ImageClassification/eng/OxfordPetsClassification.py b/mteb/tasks/Image/ImageClassification/eng/OxfordPetsClassification.py index 9608deae75..5a287ba5a7 100644 --- a/mteb/tasks/Image/ImageClassification/eng/OxfordPetsClassification.py +++ b/mteb/tasks/Image/ImageClassification/eng/OxfordPetsClassification.py @@ -29,17 +29,19 @@ class OxfordPetsClassification(AbsTaskImageClassification): dialect=[], modalities=["image"], sample_creation="created", - bibtex_citation="""@INPROCEEDINGS{6248092, - author={Parkhi, Omkar M and Vedaldi, Andrea and Zisserman, Andrew and Jawahar, C. V.}, - booktitle={2012 IEEE Conference on Computer Vision and Pattern Recognition}, - title={Cats and dogs}, - year={2012}, - volume={}, - number={}, - pages={3498-3505}, - keywords={Positron emission tomography;Image segmentation;Cats;Dogs;Layout;Deformable models;Head}, - doi={10.1109/CVPR.2012.6248092}} - """, + bibtex_citation=r""" +@inproceedings{6248092, + author = {Parkhi, Omkar M and Vedaldi, Andrea and Zisserman, Andrew and Jawahar, C. 
V.}, + booktitle = {2012 IEEE Conference on Computer Vision and Pattern Recognition}, + doi = {10.1109/CVPR.2012.6248092}, + keywords = {Positron emission tomography;Image segmentation;Cats;Dogs;Layout;Deformable models;Head}, + number = {}, + pages = {3498-3505}, + title = {Cats and dogs}, + volume = {}, + year = {2012}, +} +""", descriptive_stats={ "n_samples": {"test": 3669}, "avg_character_length": {"test": 431.4}, diff --git a/mteb/tasks/Image/ImageClassification/eng/PatchCamelyonClassification.py b/mteb/tasks/Image/ImageClassification/eng/PatchCamelyonClassification.py index 27508c8c17..0baab06893 100644 --- a/mteb/tasks/Image/ImageClassification/eng/PatchCamelyonClassification.py +++ b/mteb/tasks/Image/ImageClassification/eng/PatchCamelyonClassification.py @@ -29,25 +29,26 @@ class PatchCamelyonClassification(AbsTaskImageClassification): dialect=[], modalities=["image"], sample_creation="created", - bibtex_citation="""@InProceedings{10.1007/978-3-030-00934-2_24, -author="Veeling, Bastiaan S. + bibtex_citation=r""" +@inproceedings{10.1007/978-3-030-00934-2_24, + abstract = {We propose a new model for digital pathology segmentation, based on the observation that histopathology images are inherently symmetric under rotation and reflection. Utilizing recent findings on rotation equivariant CNNs, the proposed model leverages these symmetries in a principled manner. We present a visual analysis showing improved stability on predictions, and demonstrate that exploiting rotation equivariance significantly improves tumor detection performance on a challenging lymph node metastases dataset. We further present a novel derived dataset to enable principled comparison of machine learning models, in combination with an initial benchmark. Through this dataset, the task of histopathology diagnosis becomes accessible as a challenging benchmark for fundamental machine learning research.}, + address = {Cham}, + author = {Veeling, Bastiaan S. 
and Linmans, Jasper and Winkens, Jim and Cohen, Taco -and Welling, Max", -editor="Frangi, Alejandro F. +and Welling, Max}, + booktitle = {Medical Image Computing and Computer Assisted Intervention -- MICCAI 2018}, + editor = {Frangi, Alejandro F. and Schnabel, Julia A. and Davatzikos, Christos and Alberola-L{\'o}pez, Carlos -and Fichtinger, Gabor", -title="Rotation Equivariant CNNs for Digital Pathology", -booktitle="Medical Image Computing and Computer Assisted Intervention -- MICCAI 2018", -year="2018", -publisher="Springer International Publishing", -address="Cham", -pages="210--218", -abstract="We propose a new model for digital pathology segmentation, based on the observation that histopathology images are inherently symmetric under rotation and reflection. Utilizing recent findings on rotation equivariant CNNs, the proposed model leverages these symmetries in a principled manner. We present a visual analysis showing improved stability on predictions, and demonstrate that exploiting rotation equivariance significantly improves tumor detection performance on a challenging lymph node metastases dataset. We further present a novel derived dataset to enable principled comparison of machine learning models, in combination with an initial benchmark. 
Through this dataset, the task of histopathology diagnosis becomes accessible as a challenging benchmark for fundamental machine learning research.", -isbn="978-3-030-00934-2" +and Fichtinger, Gabor}, + isbn = {978-3-030-00934-2}, + pages = {210--218}, + publisher = {Springer International Publishing}, + title = {Rotation Equivariant CNNs for Digital Pathology}, + year = {2018}, } """, descriptive_stats={ diff --git a/mteb/tasks/Image/ImageClassification/eng/RESISC45Classification.py b/mteb/tasks/Image/ImageClassification/eng/RESISC45Classification.py index a590f66690..0c1bfe3e14 100644 --- a/mteb/tasks/Image/ImageClassification/eng/RESISC45Classification.py +++ b/mteb/tasks/Image/ImageClassification/eng/RESISC45Classification.py @@ -29,17 +29,19 @@ class RESISC45Classification(AbsTaskImageClassification): dialect=[], modalities=["image"], sample_creation="created", - bibtex_citation="""@ARTICLE{7891544, - author={Cheng, Gong and Han, Junwei and Lu, Xiaoqiang}, - journal={Proceedings of the IEEE}, - title={Remote Sensing Image Scene Classification: Benchmark and State of the Art}, - year={2017}, - volume={105}, - number={10}, - pages={1865-1883}, - keywords={Remote sensing;Benchmark testing;Spatial resolution;Social network services;Satellites;Image analysis;Machine learning;Unsupervised learning;Classification;Benchmark data set;deep learning;handcrafted features;remote sensing image;scene classification;unsupervised feature learning}, - doi={10.1109/JPROC.2017.2675998}} - """, + bibtex_citation=r""" +@article{7891544, + author = {Cheng, Gong and Han, Junwei and Lu, Xiaoqiang}, + doi = {10.1109/JPROC.2017.2675998}, + journal = {Proceedings of the IEEE}, + keywords = {Remote sensing;Benchmark testing;Spatial resolution;Social network services;Satellites;Image analysis;Machine learning;Unsupervised learning;Classification;Benchmark data set;deep learning;handcrafted features;remote sensing image;scene classification;unsupervised feature learning}, + number = {10}, + 
pages = {1865-1883}, + title = {Remote Sensing Image Scene Classification: Benchmark and State of the Art}, + volume = {105}, + year = {2017}, +} +""", descriptive_stats={ "n_samples": {"test": 6300}, "avg_character_length": {"test": 256}, diff --git a/mteb/tasks/Image/ImageClassification/eng/STL10Classification.py b/mteb/tasks/Image/ImageClassification/eng/STL10Classification.py index 11ea833477..8ba861f1e8 100644 --- a/mteb/tasks/Image/ImageClassification/eng/STL10Classification.py +++ b/mteb/tasks/Image/ImageClassification/eng/STL10Classification.py @@ -29,22 +29,23 @@ class STL10Classification(AbsTaskImageClassification): dialect=[], modalities=["image"], sample_creation="created", - bibtex_citation="""@InProceedings{pmlr-v15-coates11a, - title = {An Analysis of Single-Layer Networks in Unsupervised Feature Learning}, - author = {Coates, Adam and Ng, Andrew and Lee, Honglak}, - booktitle = {Proceedings of the Fourteenth International Conference on Artificial Intelligence and Statistics}, - pages = {215--223}, - year = {2011}, - editor = {Gordon, Geoffrey and Dunson, David and Dudík, Miroslav}, - volume = {15}, - series = {Proceedings of Machine Learning Research}, - address = {Fort Lauderdale, FL, USA}, - month = {11--13 Apr}, - publisher = {PMLR}, - pdf = {http://proceedings.mlr.press/v15/coates11a/coates11a.pdf}, - url = {https://proceedings.mlr.press/v15/coates11a.html}, - } - """, + bibtex_citation=r""" +@inproceedings{pmlr-v15-coates11a, + address = {Fort Lauderdale, FL, USA}, + author = {Coates, Adam and Ng, Andrew and Lee, Honglak}, + booktitle = {Proceedings of the Fourteenth International Conference on Artificial Intelligence and Statistics}, + editor = {Gordon, Geoffrey and Dunson, David and Dudík, Miroslav}, + month = {11--13 Apr}, + pages = {215--223}, + pdf = {http://proceedings.mlr.press/v15/coates11a/coates11a.pdf}, + publisher = {PMLR}, + series = {Proceedings of Machine Learning Research}, + title = {An Analysis of Single-Layer Networks in 
Unsupervised Feature Learning}, + url = {https://proceedings.mlr.press/v15/coates11a.html}, + volume = {15}, + year = {2011}, +} +""", descriptive_stats={ "n_samples": {"test": 8000}, "avg_character_length": {"test": 431.4}, diff --git a/mteb/tasks/Image/ImageClassification/eng/SUN397Classification.py b/mteb/tasks/Image/ImageClassification/eng/SUN397Classification.py index 08d053fc2d..e6383a46e6 100644 --- a/mteb/tasks/Image/ImageClassification/eng/SUN397Classification.py +++ b/mteb/tasks/Image/ImageClassification/eng/SUN397Classification.py @@ -29,16 +29,18 @@ class SUN397Classification(AbsTaskImageClassification): dialect=[], modalities=["image"], sample_creation="created", - bibtex_citation="""@INPROCEEDINGS{5539970, - author={Xiao, Jianxiong and Hays, James and Ehinger, Krista A. and Oliva, Aude and Torralba, Antonio}, - booktitle={2010 IEEE Computer Society Conference on Computer Vision and Pattern Recognition}, - title={SUN database: Large-scale scene recognition from abbey to zoo}, - year={2010}, - volume={}, - number={}, - pages={3485-3492}, - doi={10.1109/CVPR.2010.5539970}} - """, + bibtex_citation=r""" +@inproceedings{5539970, + author = {Xiao, Jianxiong and Hays, James and Ehinger, Krista A. 
and Oliva, Aude and Torralba, Antonio}, + booktitle = {2010 IEEE Computer Society Conference on Computer Vision and Pattern Recognition}, + doi = {10.1109/CVPR.2010.5539970}, + number = {}, + pages = {3485-3492}, + title = {SUN database: Large-scale scene recognition from abbey to zoo}, + volume = {}, + year = {2010}, +} +""", descriptive_stats={ "n_samples": {"test": 21750}, "avg_character_length": {"test": 256}, diff --git a/mteb/tasks/Image/ImageClassification/eng/StanfordCarsClassification.py b/mteb/tasks/Image/ImageClassification/eng/StanfordCarsClassification.py index 74fa5e92b8..34f1fe2cb2 100644 --- a/mteb/tasks/Image/ImageClassification/eng/StanfordCarsClassification.py +++ b/mteb/tasks/Image/ImageClassification/eng/StanfordCarsClassification.py @@ -29,13 +29,14 @@ class StanfordCarsClassification(AbsTaskImageClassification): dialect=[], modalities=["image"], sample_creation="created", - bibtex_citation="""@inproceedings{Krause2013CollectingAL, - title={Collecting a Large-scale Dataset of Fine-grained Cars}, - author={Jonathan Krause and Jia Deng and Michael Stark and Li Fei-Fei}, - year={2013}, - url={https://api.semanticscholar.org/CorpusID:16632981} - } - """, + bibtex_citation=r""" +@inproceedings{Krause2013CollectingAL, + author = {Jonathan Krause and Jia Deng and Michael Stark and Li Fei-Fei}, + title = {Collecting a Large-scale Dataset of Fine-grained Cars}, + url = {https://api.semanticscholar.org/CorpusID:16632981}, + year = {2013}, +} +""", descriptive_stats={ "n_samples": {"test": 8041}, "avg_character_length": {"test": 431.4}, diff --git a/mteb/tasks/Image/ImageClassification/eng/UCF101Classification.py b/mteb/tasks/Image/ImageClassification/eng/UCF101Classification.py index 5f2a621cc7..8ffa24823b 100644 --- a/mteb/tasks/Image/ImageClassification/eng/UCF101Classification.py +++ b/mteb/tasks/Image/ImageClassification/eng/UCF101Classification.py @@ -33,15 +33,17 @@ class UCF101Classification(AbsTaskImageClassification): dialect=[], 
modalities=["image"], sample_creation="created", - bibtex_citation="""@misc{soomro2012ucf101dataset101human, - title={UCF101: A Dataset of 101 Human Actions Classes From Videos in The Wild}, - author={Khurram Soomro and Amir Roshan Zamir and Mubarak Shah}, - year={2012}, - eprint={1212.0402}, - archivePrefix={arXiv}, - primaryClass={cs.CV}, - url={https://arxiv.org/abs/1212.0402}, -}""", + bibtex_citation=r""" +@misc{soomro2012ucf101dataset101human, + archiveprefix = {arXiv}, + author = {Khurram Soomro and Amir Roshan Zamir and Mubarak Shah}, + eprint = {1212.0402}, + primaryclass = {cs.CV}, + title = {UCF101: A Dataset of 101 Human Actions Classes From Videos in The Wild}, + url = {https://arxiv.org/abs/1212.0402}, + year = {2012}, +} +""", descriptive_stats={ "n_samples": {"test": 697222}, "avg_character_length": {"test": 0}, diff --git a/mteb/tasks/Image/Clustering/__init__.py b/mteb/tasks/Image/ImageClustering/__init__.py similarity index 100% rename from mteb/tasks/Image/Clustering/__init__.py rename to mteb/tasks/Image/ImageClustering/__init__.py diff --git a/mteb/tasks/Image/Clustering/eng/CIFAR.py b/mteb/tasks/Image/ImageClustering/eng/CIFAR.py similarity index 81% rename from mteb/tasks/Image/Clustering/eng/CIFAR.py rename to mteb/tasks/Image/ImageClustering/eng/CIFAR.py index a10906d105..f64b728f14 100644 --- a/mteb/tasks/Image/Clustering/eng/CIFAR.py +++ b/mteb/tasks/Image/ImageClustering/eng/CIFAR.py @@ -29,13 +29,14 @@ class CIFAR10Clustering(AbsTaskImageClustering): dialect=[], modalities=["image"], sample_creation="created", - bibtex_citation=""" @TECHREPORT{Krizhevsky09learningmultiple, - author = {Alex Krizhevsky}, - title = {Learning multiple layers of features from tiny images}, - institution = {}, - year = {2009} - } - """, + bibtex_citation=r""" +@techreport{Krizhevsky09learningmultiple, + author = {Alex Krizhevsky}, + institution = {}, + title = {Learning multiple layers of features from tiny images}, + year = {2009}, +} +""", 
descriptive_stats={ "n_samples": {"test": 10000}, "avg_character_length": {"test": 431.4}, @@ -70,13 +71,14 @@ class CIFAR100Clustering(AbsTaskImageClustering): dialect=[], modalities=["image"], sample_creation="created", - bibtex_citation=""" @TECHREPORT{Krizhevsky09learningmultiple, - author = {Alex Krizhevsky}, - title = {Learning multiple layers of features from tiny images}, - institution = {}, - year = {2009} - } - """, + bibtex_citation=r""" +@techreport{Krizhevsky09learningmultiple, + author = {Alex Krizhevsky}, + institution = {}, + title = {Learning multiple layers of features from tiny images}, + year = {2009}, +} +""", descriptive_stats={ "n_samples": {"test": 10000}, "avg_character_length": {"test": 431.4}, diff --git a/mteb/tasks/Image/Clustering/eng/ImageNet.py b/mteb/tasks/Image/ImageClustering/eng/ImageNet.py similarity index 66% rename from mteb/tasks/Image/Clustering/eng/ImageNet.py rename to mteb/tasks/Image/ImageClustering/eng/ImageNet.py index 5969396dc5..2de235666f 100644 --- a/mteb/tasks/Image/Clustering/eng/ImageNet.py +++ b/mteb/tasks/Image/ImageClustering/eng/ImageNet.py @@ -26,17 +26,19 @@ class ImageNetDog15Clustering(AbsTaskImageClustering): dialect=[], modalities=["image"], sample_creation="created", - bibtex_citation=""" @INPROCEEDINGS{5206848, - author={Deng, Jia and Dong, Wei and Socher, Richard and Li, Li-Jia and Kai Li and Li Fei-Fei}, - booktitle={2009 IEEE Conference on Computer Vision and Pattern Recognition}, - title={ImageNet: A large-scale hierarchical image database}, - year={2009}, - volume={}, - number={}, - pages={248-255}, - keywords={Large-scale systems;Image databases;Explosions;Internet;Robustness;Information retrieval;Image retrieval;Multimedia databases;Ontologies;Spine}, - doi={10.1109/CVPR.2009.5206848}} - """, + bibtex_citation=r""" +@inproceedings{5206848, + author = {Deng, Jia and Dong, Wei and Socher, Richard and Li, Li-Jia and Kai Li and Li Fei-Fei}, + booktitle = {2009 IEEE Conference on Computer Vision 
and Pattern Recognition}, + doi = {10.1109/CVPR.2009.5206848}, + keywords = {Large-scale systems;Image databases;Explosions;Internet;Robustness;Information retrieval;Image retrieval;Multimedia databases;Ontologies;Spine}, + number = {}, + pages = {248-255}, + title = {ImageNet: A large-scale hierarchical image database}, + volume = {}, + year = {2009}, +} +""", descriptive_stats={ "n_samples": {"test": 1076, "train": 1500}, # "avg_character_length": {"test": 431.4}, @@ -66,17 +68,19 @@ class ImageNet10Clustering(AbsTaskImageClustering): dialect=[], modalities=["image"], sample_creation="created", - bibtex_citation=""" @INPROCEEDINGS{5206848, - author={Deng, Jia and Dong, Wei and Socher, Richard and Li, Li-Jia and Kai Li and Li Fei-Fei}, - booktitle={2009 IEEE Conference on Computer Vision and Pattern Recognition}, - title={ImageNet: A large-scale hierarchical image database}, - year={2009}, - volume={}, - number={}, - pages={248-255}, - keywords={Large-scale systems;Image databases;Explosions;Internet;Robustness;Information retrieval;Image retrieval;Multimedia databases;Ontologies;Spine}, - doi={10.1109/CVPR.2009.5206848}} - """, + bibtex_citation=r""" +@inproceedings{5206848, + author = {Deng, Jia and Dong, Wei and Socher, Richard and Li, Li-Jia and Kai Li and Li Fei-Fei}, + booktitle = {2009 IEEE Conference on Computer Vision and Pattern Recognition}, + doi = {10.1109/CVPR.2009.5206848}, + keywords = {Large-scale systems;Image databases;Explosions;Internet;Robustness;Information retrieval;Image retrieval;Multimedia databases;Ontologies;Spine}, + number = {}, + pages = {248-255}, + title = {ImageNet: A large-scale hierarchical image database}, + volume = {}, + year = {2009}, +} +""", descriptive_stats={ "n_samples": {"test": 13000}, # "avg_character_length": {"test": 431.4}, diff --git a/mteb/tasks/Image/Clustering/eng/TinyImageNet.py b/mteb/tasks/Image/ImageClustering/eng/TinyImageNet.py similarity index 97% rename from 
mteb/tasks/Image/Clustering/eng/TinyImageNet.py rename to mteb/tasks/Image/ImageClustering/eng/TinyImageNet.py index d49ebbfde6..dafc9686d9 100644 --- a/mteb/tasks/Image/Clustering/eng/TinyImageNet.py +++ b/mteb/tasks/Image/ImageClustering/eng/TinyImageNet.py @@ -29,7 +29,7 @@ class TinyImageNet(AbsTaskImageClustering): dialect=[], modalities=["image"], sample_creation="found", - bibtex_citation="""d""", + bibtex_citation="", descriptive_stats={ "n_samples": {"valid": 10000}, "avg_character_length": {"valid": 431.4}, diff --git a/mteb/tasks/Image/Clustering/eng/__init__.py b/mteb/tasks/Image/ImageClustering/eng/__init__.py similarity index 100% rename from mteb/tasks/Image/Clustering/eng/__init__.py rename to mteb/tasks/Image/ImageClustering/eng/__init__.py diff --git a/mteb/tasks/Image/ImageMultilabelClassification/eng/PascalVOC2007.py b/mteb/tasks/Image/ImageMultilabelClassification/eng/PascalVOC2007.py index e3839505d6..d7907983c2 100644 --- a/mteb/tasks/Image/ImageMultilabelClassification/eng/PascalVOC2007.py +++ b/mteb/tasks/Image/ImageMultilabelClassification/eng/PascalVOC2007.py @@ -35,17 +35,18 @@ class VOC2007Classification(AbsTaskImageMultilabelClassification): dialect=[], modalities=["image"], sample_creation="created", - bibtex_citation="""@Article{Everingham10, - author = "Everingham, M. and Van~Gool, L. and Williams, C. K. I. and Winn, J. and Zisserman, A.", - title = "The Pascal Visual Object Classes (VOC) Challenge", - journal = "International Journal of Computer Vision", - volume = "88", - year = "2010", - number = "2", - month = jun, - pages = "303--338", - } - """, + bibtex_citation=r""" +@article{Everingham10, + author = {Everingham, M. and Van~Gool, L. and Williams, C. K. I. and Winn, J. 
and Zisserman, A.}, + journal = {International Journal of Computer Vision}, + month = jun, + number = {2}, + pages = {303--338}, + title = {The Pascal Visual Object Classes (VOC) Challenge}, + volume = {88}, + year = {2010}, +} +""", descriptive_stats={ "n_samples": {"test": 4952}, "avg_character_length": {"test": 431.4}, diff --git a/mteb/tasks/Image/ImageTextPairClassification/AROCocoOrder.py b/mteb/tasks/Image/ImageTextPairClassification/AROCocoOrder.py index 8a227494ae..f538c5b716 100644 --- a/mteb/tasks/Image/ImageTextPairClassification/AROCocoOrder.py +++ b/mteb/tasks/Image/ImageTextPairClassification/AROCocoOrder.py @@ -41,12 +41,14 @@ class AROCocoOrder(AbsTaskImageTextPairClassification): dialect=[], modalities=["text", "image"], sample_creation="created", - bibtex_citation="""@inproceedings{yuksekgonul2023and, - title={When and why vision-language models behave like bags-of-words, and what to do about it?}, - author={Yuksekgonul, Mert and Bianchi, Federico and Kalluri, Pratyusha and Jurafsky, Dan and Zou, James}, - booktitle={The Eleventh International Conference on Learning Representations}, - year={2023} -}""", + bibtex_citation=r""" +@inproceedings{yuksekgonul2023and, + author = {Yuksekgonul, Mert and Bianchi, Federico and Kalluri, Pratyusha and Jurafsky, Dan and Zou, James}, + booktitle = {The Eleventh International Conference on Learning Representations}, + title = {When and why vision-language models behave like bags-of-words, and what to do about it?}, + year = {2023}, +} +""", descriptive_stats={ "n_samples": {"test": 25010}, "avg_character_length": {"test": 1}, diff --git a/mteb/tasks/Image/ImageTextPairClassification/AROFlickrOrder.py b/mteb/tasks/Image/ImageTextPairClassification/AROFlickrOrder.py index 78fc0b8c79..b3a44cfd3a 100644 --- a/mteb/tasks/Image/ImageTextPairClassification/AROFlickrOrder.py +++ b/mteb/tasks/Image/ImageTextPairClassification/AROFlickrOrder.py @@ -41,12 +41,14 @@ class AROFlickrOrder(AbsTaskImageTextPairClassification): 
dialect=[], modalities=["text", "image"], sample_creation="created", - bibtex_citation="""@inproceedings{yuksekgonul2023and, - title={When and why vision-language models behave like bags-of-words, and what to do about it?}, - author={Yuksekgonul, Mert and Bianchi, Federico and Kalluri, Pratyusha and Jurafsky, Dan and Zou, James}, - booktitle={The Eleventh International Conference on Learning Representations}, - year={2023} -}""", + bibtex_citation=r""" +@inproceedings{yuksekgonul2023and, + author = {Yuksekgonul, Mert and Bianchi, Federico and Kalluri, Pratyusha and Jurafsky, Dan and Zou, James}, + booktitle = {The Eleventh International Conference on Learning Representations}, + title = {When and why vision-language models behave like bags-of-words, and what to do about it?}, + year = {2023}, +} +""", descriptive_stats={ "n_samples": {"test": 5000}, "avg_character_length": {"test": 1}, diff --git a/mteb/tasks/Image/ImageTextPairClassification/AROVisualAttribution.py b/mteb/tasks/Image/ImageTextPairClassification/AROVisualAttribution.py index b43ac87a00..d051e59470 100644 --- a/mteb/tasks/Image/ImageTextPairClassification/AROVisualAttribution.py +++ b/mteb/tasks/Image/ImageTextPairClassification/AROVisualAttribution.py @@ -34,12 +34,14 @@ class AROVisualAttribution(AbsTaskImageTextPairClassification): dialect=[], modalities=["text", "image"], sample_creation="created", - bibtex_citation="""@inproceedings{yuksekgonul2023and, - title={When and why vision-language models behave like bags-of-words, and what to do about it?}, - author={Yuksekgonul, Mert and Bianchi, Federico and Kalluri, Pratyusha and Jurafsky, Dan and Zou, James}, - booktitle={The Eleventh International Conference on Learning Representations}, - year={2023} -}""", + bibtex_citation=r""" +@inproceedings{yuksekgonul2023and, + author = {Yuksekgonul, Mert and Bianchi, Federico and Kalluri, Pratyusha and Jurafsky, Dan and Zou, James}, + booktitle = {The Eleventh International Conference on Learning 
Representations}, + title = {When and why vision-language models behave like bags-of-words, and what to do about it?}, + year = {2023}, +} +""", descriptive_stats={ "n_samples": {"test": 28748}, "avg_character_length": {"test": 1}, diff --git a/mteb/tasks/Image/ImageTextPairClassification/AROVisualRelation.py b/mteb/tasks/Image/ImageTextPairClassification/AROVisualRelation.py index 1d74de646c..170e0f07cb 100644 --- a/mteb/tasks/Image/ImageTextPairClassification/AROVisualRelation.py +++ b/mteb/tasks/Image/ImageTextPairClassification/AROVisualRelation.py @@ -34,12 +34,14 @@ class AROVisualRelation(AbsTaskImageTextPairClassification): dialect=[], modalities=["text", "image"], sample_creation="created", - bibtex_citation="""@inproceedings{yuksekgonul2023and, - title={When and why vision-language models behave like bags-of-words, and what to do about it?}, - author={Yuksekgonul, Mert and Bianchi, Federico and Kalluri, Pratyusha and Jurafsky, Dan and Zou, James}, - booktitle={The Eleventh International Conference on Learning Representations}, - year={2023} -}""", + bibtex_citation=r""" +@inproceedings{yuksekgonul2023and, + author = {Yuksekgonul, Mert and Bianchi, Federico and Kalluri, Pratyusha and Jurafsky, Dan and Zou, James}, + booktitle = {The Eleventh International Conference on Learning Representations}, + title = {When and why vision-language models behave like bags-of-words, and what to do about it?}, + year = {2023}, +} +""", descriptive_stats={ "n_samples": {"test": 23937}, "avg_character_length": {"test": 1}, diff --git a/mteb/tasks/Image/ImageTextPairClassification/ImageCoDe.py b/mteb/tasks/Image/ImageTextPairClassification/ImageCoDe.py index a413a78ccc..5361a0dce6 100644 --- a/mteb/tasks/Image/ImageTextPairClassification/ImageCoDe.py +++ b/mteb/tasks/Image/ImageTextPairClassification/ImageCoDe.py @@ -45,11 +45,12 @@ class ImageCoDe(AbsTaskImageTextPairClassification): dialect=[], modalities=["text", "image"], sample_creation="found", - 
bibtex_citation="""@article{krojer2022image, - title={Image retrieval from contextual descriptions}, - author={Krojer, Benno and Adlakha, Vaibhav and Vineet, Vibhav and Goyal, Yash and Ponti, Edoardo and Reddy, Siva}, - journal={arXiv preprint arXiv:2203.15867}, - year={2022} + bibtex_citation=r""" +@article{krojer2022image, + author = {Krojer, Benno and Adlakha, Vaibhav and Vineet, Vibhav and Goyal, Yash and Ponti, Edoardo and Reddy, Siva}, + journal = {arXiv preprint arXiv:2203.15867}, + title = {Image retrieval from contextual descriptions}, + year = {2022}, } """, descriptive_stats={ diff --git a/mteb/tasks/Image/ImageTextPairClassification/SugarCrepe.py b/mteb/tasks/Image/ImageTextPairClassification/SugarCrepe.py index 94114f100e..0225865cb2 100644 --- a/mteb/tasks/Image/ImageTextPairClassification/SugarCrepe.py +++ b/mteb/tasks/Image/ImageTextPairClassification/SugarCrepe.py @@ -36,13 +36,15 @@ class SugarCrepe(AbsTaskImageTextPairClassification): dialect=[], modalities=["text", "image"], sample_creation="created", - bibtex_citation="""@article{hsieh2024sugarcrepe, - title={Sugarcrepe: Fixing hackable benchmarks for vision-language compositionality}, - author={Hsieh, Cheng-Yu and Zhang, Jieyu and Ma, Zixian and Kembhavi, Aniruddha and Krishna, Ranjay}, - journal={Advances in neural information processing systems}, - volume={36}, - year={2024} -}""", + bibtex_citation=r""" +@article{hsieh2024sugarcrepe, + author = {Hsieh, Cheng-Yu and Zhang, Jieyu and Ma, Zixian and Kembhavi, Aniruddha and Krishna, Ranjay}, + journal = {Advances in neural information processing systems}, + title = {Sugarcrepe: Fixing hackable benchmarks for vision-language compositionality}, + volume = {36}, + year = {2024}, +} +""", descriptive_stats={ "n_samples": {"test": 7511}, "avg_character_length": {"test": 1}, diff --git a/mteb/tasks/Image/ImageTextPairClassification/Winoground.py b/mteb/tasks/Image/ImageTextPairClassification/Winoground.py index ff0c435fb7..f226a55466 100644 --- 
a/mteb/tasks/Image/ImageTextPairClassification/Winoground.py +++ b/mteb/tasks/Image/ImageTextPairClassification/Winoground.py @@ -34,15 +34,17 @@ class Winoground(AbsTaskImageTextPairClassification): dialect=[], modalities=["text", "image"], sample_creation="created", - bibtex_citation="""@misc{thrush2022winogroundprobingvisionlanguage, - title={Winoground: Probing Vision and Language Models for Visio-Linguistic Compositionality}, - author={Tristan Thrush and Ryan Jiang and Max Bartolo and Amanpreet Singh and Adina Williams and Douwe Kiela and Candace Ross}, - year={2022}, - eprint={2204.03162}, - archivePrefix={arXiv}, - primaryClass={cs.CV}, - url={https://arxiv.org/abs/2204.03162}, - }""", + bibtex_citation=r""" +@misc{thrush2022winogroundprobingvisionlanguage, + archiveprefix = {arXiv}, + author = {Tristan Thrush and Ryan Jiang and Max Bartolo and Amanpreet Singh and Adina Williams and Douwe Kiela and Candace Ross}, + eprint = {2204.03162}, + primaryclass = {cs.CV}, + title = {Winoground: Probing Vision and Language Models for Visio-Linguistic Compositionality}, + url = {https://arxiv.org/abs/2204.03162}, + year = {2022}, +} +""", descriptive_stats={ "n_samples": {"test": 400}, "avg_character_length": {"test": 431.4}, diff --git a/mteb/tasks/Image/VisualSTS/__init__.py b/mteb/tasks/Image/VisualSTS/__init__.py index eb785d5d85..832708310c 100644 --- a/mteb/tasks/Image/VisualSTS/__init__.py +++ b/mteb/tasks/Image/VisualSTS/__init__.py @@ -1,9 +1,9 @@ from __future__ import annotations -from .en.STS12VisualSTS import * -from .en.STS13VisualSTS import * -from .en.STS14VisualSTS import * -from .en.STS15VisualSTS import * -from .en.STS16VisualSTS import * +from .eng.STS12VisualSTS import * +from .eng.STS13VisualSTS import * +from .eng.STS14VisualSTS import * +from .eng.STS15VisualSTS import * +from .eng.STS16VisualSTS import * from .multilingual.STS17MultilingualVisualSTS import * from .multilingual.STSBenchmarkMultilingualVisualSTS import * diff --git 
a/mteb/tasks/Image/VisualSTS/en/STS12VisualSTS.py b/mteb/tasks/Image/VisualSTS/eng/STS12VisualSTS.py similarity index 78% rename from mteb/tasks/Image/VisualSTS/en/STS12VisualSTS.py rename to mteb/tasks/Image/VisualSTS/eng/STS12VisualSTS.py index 3a99e9fc47..1bc81485c0 100644 --- a/mteb/tasks/Image/VisualSTS/en/STS12VisualSTS.py +++ b/mteb/tasks/Image/VisualSTS/eng/STS12VisualSTS.py @@ -26,12 +26,14 @@ class STS12VisualSTS(AbsTaskVisualSTS): annotations_creators="human-annotated", dialect=[], sample_creation="rendered", - bibtex_citation="""@article{xiao2024pixel, - title={Pixel Sentence Representation Learning}, - author={Xiao, Chenghao and Huang, Zhuoxu and Chen, Danlu and Hudson, G Thomas and Li, Yizhi and Duan, Haoran and Lin, Chenghua and Fu, Jie and Han, Jungong and Moubayed, Noura Al}, - journal={arXiv preprint arXiv:2402.08183}, - year={2024} -}""", + bibtex_citation=r""" +@article{xiao2024pixel, + author = {Xiao, Chenghao and Huang, Zhuoxu and Chen, Danlu and Hudson, G Thomas and Li, Yizhi and Duan, Haoran and Lin, Chenghua and Fu, Jie and Han, Jungong and Moubayed, Noura Al}, + journal = {arXiv preprint arXiv:2402.08183}, + title = {Pixel Sentence Representation Learning}, + year = {2024}, +} +""", descriptive_stats={ "n_samples": {"test": 5342}, "avg_character_length": {"dev": 1.0, "test": 1.0}, diff --git a/mteb/tasks/Image/VisualSTS/en/STS13VisualSTS.py b/mteb/tasks/Image/VisualSTS/eng/STS13VisualSTS.py similarity index 78% rename from mteb/tasks/Image/VisualSTS/en/STS13VisualSTS.py rename to mteb/tasks/Image/VisualSTS/eng/STS13VisualSTS.py index b66678c851..fde965fb93 100644 --- a/mteb/tasks/Image/VisualSTS/en/STS13VisualSTS.py +++ b/mteb/tasks/Image/VisualSTS/eng/STS13VisualSTS.py @@ -26,12 +26,14 @@ class STS13VisualSTS(AbsTaskVisualSTS): annotations_creators="human-annotated", dialect=[], sample_creation="rendered", - bibtex_citation="""@article{xiao2024pixel, - title={Pixel Sentence Representation Learning}, - author={Xiao, Chenghao and Huang, 
Zhuoxu and Chen, Danlu and Hudson, G Thomas and Li, Yizhi and Duan, Haoran and Lin, Chenghua and Fu, Jie and Han, Jungong and Moubayed, Noura Al}, - journal={arXiv preprint arXiv:2402.08183}, - year={2024} -}""", + bibtex_citation=r""" +@article{xiao2024pixel, + author = {Xiao, Chenghao and Huang, Zhuoxu and Chen, Danlu and Hudson, G Thomas and Li, Yizhi and Duan, Haoran and Lin, Chenghua and Fu, Jie and Han, Jungong and Moubayed, Noura Al}, + journal = {arXiv preprint arXiv:2402.08183}, + title = {Pixel Sentence Representation Learning}, + year = {2024}, +} +""", descriptive_stats={ "n_samples": {"test": 1500}, "avg_character_length": {"dev": 1.0, "test": 1.0}, diff --git a/mteb/tasks/Image/VisualSTS/en/STS14VisualSTS.py b/mteb/tasks/Image/VisualSTS/eng/STS14VisualSTS.py similarity index 79% rename from mteb/tasks/Image/VisualSTS/en/STS14VisualSTS.py rename to mteb/tasks/Image/VisualSTS/eng/STS14VisualSTS.py index 0820ed7823..39c9d725e2 100644 --- a/mteb/tasks/Image/VisualSTS/en/STS14VisualSTS.py +++ b/mteb/tasks/Image/VisualSTS/eng/STS14VisualSTS.py @@ -27,12 +27,14 @@ class STS14VisualSTS(AbsTaskVisualSTS): annotations_creators="derived", dialect=[], sample_creation="rendered", - bibtex_citation="""@article{xiao2024pixel, - title={Pixel Sentence Representation Learning}, - author={Xiao, Chenghao and Huang, Zhuoxu and Chen, Danlu and Hudson, G Thomas and Li, Yizhi and Duan, Haoran and Lin, Chenghua and Fu, Jie and Han, Jungong and Moubayed, Noura Al}, - journal={arXiv preprint arXiv:2402.08183}, - year={2024} -}""", + bibtex_citation=r""" +@article{xiao2024pixel, + author = {Xiao, Chenghao and Huang, Zhuoxu and Chen, Danlu and Hudson, G Thomas and Li, Yizhi and Duan, Haoran and Lin, Chenghua and Fu, Jie and Han, Jungong and Moubayed, Noura Al}, + journal = {arXiv preprint arXiv:2402.08183}, + title = {Pixel Sentence Representation Learning}, + year = {2024}, +} +""", descriptive_stats={ "n_samples": {"test": 3750}, "avg_character_length": {"dev": 1.0, "test": 
1.0}, diff --git a/mteb/tasks/Image/VisualSTS/en/STS15VisualSTS.py b/mteb/tasks/Image/VisualSTS/eng/STS15VisualSTS.py similarity index 78% rename from mteb/tasks/Image/VisualSTS/en/STS15VisualSTS.py rename to mteb/tasks/Image/VisualSTS/eng/STS15VisualSTS.py index 8a9b8c682b..3ca654b2e4 100644 --- a/mteb/tasks/Image/VisualSTS/en/STS15VisualSTS.py +++ b/mteb/tasks/Image/VisualSTS/eng/STS15VisualSTS.py @@ -26,12 +26,14 @@ class STS15VisualSTS(AbsTaskVisualSTS): annotations_creators="human-annotated", dialect=[], sample_creation="rendered", - bibtex_citation="""@article{xiao2024pixel, - title={Pixel Sentence Representation Learning}, - author={Xiao, Chenghao and Huang, Zhuoxu and Chen, Danlu and Hudson, G Thomas and Li, Yizhi and Duan, Haoran and Lin, Chenghua and Fu, Jie and Han, Jungong and Moubayed, Noura Al}, - journal={arXiv preprint arXiv:2402.08183}, - year={2024} -}""", + bibtex_citation=r""" +@article{xiao2024pixel, + author = {Xiao, Chenghao and Huang, Zhuoxu and Chen, Danlu and Hudson, G Thomas and Li, Yizhi and Duan, Haoran and Lin, Chenghua and Fu, Jie and Han, Jungong and Moubayed, Noura Al}, + journal = {arXiv preprint arXiv:2402.08183}, + title = {Pixel Sentence Representation Learning}, + year = {2024}, +} +""", descriptive_stats={ "n_samples": {"test": 3000}, "avg_character_length": {"dev": 1.0, "test": 1.0}, diff --git a/mteb/tasks/Image/VisualSTS/en/STS16VisualSTS.py b/mteb/tasks/Image/VisualSTS/eng/STS16VisualSTS.py similarity index 78% rename from mteb/tasks/Image/VisualSTS/en/STS16VisualSTS.py rename to mteb/tasks/Image/VisualSTS/eng/STS16VisualSTS.py index ea82fa5a8b..40b52e8c1f 100644 --- a/mteb/tasks/Image/VisualSTS/en/STS16VisualSTS.py +++ b/mteb/tasks/Image/VisualSTS/eng/STS16VisualSTS.py @@ -26,12 +26,14 @@ class STS16VisualSTS(AbsTaskVisualSTS): annotations_creators="human-annotated", dialect=[], sample_creation="rendered", - bibtex_citation="""@article{xiao2024pixel, - title={Pixel Sentence Representation Learning}, - author={Xiao, 
Chenghao and Huang, Zhuoxu and Chen, Danlu and Hudson, G Thomas and Li, Yizhi and Duan, Haoran and Lin, Chenghua and Fu, Jie and Han, Jungong and Moubayed, Noura Al}, - journal={arXiv preprint arXiv:2402.08183}, - year={2024} -}""", + bibtex_citation=r""" +@article{xiao2024pixel, + author = {Xiao, Chenghao and Huang, Zhuoxu and Chen, Danlu and Hudson, G Thomas and Li, Yizhi and Duan, Haoran and Lin, Chenghua and Fu, Jie and Han, Jungong and Moubayed, Noura Al}, + journal = {arXiv preprint arXiv:2402.08183}, + title = {Pixel Sentence Representation Learning}, + year = {2024}, +} +""", descriptive_stats={ "n_samples": {"test": 1186}, "avg_character_length": {"dev": 1.0, "test": 1.0}, diff --git a/mteb/tasks/Image/VisualSTS/en/__init__.py b/mteb/tasks/Image/VisualSTS/eng/__init__.py similarity index 100% rename from mteb/tasks/Image/VisualSTS/en/__init__.py rename to mteb/tasks/Image/VisualSTS/eng/__init__.py diff --git a/mteb/tasks/Image/VisualSTS/multilingual/STS17MultilingualVisualSTS.py b/mteb/tasks/Image/VisualSTS/multilingual/STS17MultilingualVisualSTS.py index 2bf15406f3..06471ba36b 100644 --- a/mteb/tasks/Image/VisualSTS/multilingual/STS17MultilingualVisualSTS.py +++ b/mteb/tasks/Image/VisualSTS/multilingual/STS17MultilingualVisualSTS.py @@ -46,12 +46,14 @@ class STS17MultilingualVisualSTS(AbsTaskVisualSTS, MultilingualTask): annotations_creators="human-annotated", dialect=[], sample_creation="rendered", - bibtex_citation="""@article{xiao2024pixel, - title={Pixel Sentence Representation Learning}, - author={Xiao, Chenghao and Huang, Zhuoxu and Chen, Danlu and Hudson, G Thomas and Li, Yizhi and Duan, Haoran and Lin, Chenghua and Fu, Jie and Han, Jungong and Moubayed, Noura Al}, - journal={arXiv preprint arXiv:2402.08183}, - year={2024} -}""", + bibtex_citation=r""" +@article{xiao2024pixel, + author = {Xiao, Chenghao and Huang, Zhuoxu and Chen, Danlu and Hudson, G Thomas and Li, Yizhi and Duan, Haoran and Lin, Chenghua and Fu, Jie and Han, Jungong and Moubayed, 
Noura Al}, + journal = {arXiv preprint arXiv:2402.08183}, + title = {Pixel Sentence Representation Learning}, + year = {2024}, +} +""", descriptive_stats={ "n_samples": {"test": 10692}, "avg_character_length": {"dev": 1.0, "test": 1.0}, diff --git a/mteb/tasks/Image/VisualSTS/multilingual/STSBenchmarkMultilingualVisualSTS.py b/mteb/tasks/Image/VisualSTS/multilingual/STSBenchmarkMultilingualVisualSTS.py index 9cab4e2f45..b32cf20711 100644 --- a/mteb/tasks/Image/VisualSTS/multilingual/STSBenchmarkMultilingualVisualSTS.py +++ b/mteb/tasks/Image/VisualSTS/multilingual/STSBenchmarkMultilingualVisualSTS.py @@ -47,12 +47,14 @@ class STSBenchmarkMultilingualVisualSTS(AbsTaskVisualSTS, MultilingualTask): annotations_creators="human-annotated", dialect=[], sample_creation="rendered", - bibtex_citation="""@article{xiao2024pixel, - title={Pixel Sentence Representation Learning}, - author={Xiao, Chenghao and Huang, Zhuoxu and Chen, Danlu and Hudson, G Thomas and Li, Yizhi and Duan, Haoran and Lin, Chenghua and Fu, Jie and Han, Jungong and Moubayed, Noura Al}, - journal={arXiv preprint arXiv:2402.08183}, - year={2024} -}""", + bibtex_citation=r""" +@article{xiao2024pixel, + author = {Xiao, Chenghao and Huang, Zhuoxu and Chen, Danlu and Hudson, G Thomas and Li, Yizhi and Duan, Haoran and Lin, Chenghua and Fu, Jie and Han, Jungong and Moubayed, Noura Al}, + journal = {arXiv preprint arXiv:2402.08183}, + title = {Pixel Sentence Representation Learning}, + year = {2024}, +} +""", descriptive_stats={ "n_samples": {"dev": 15000, "test": 13790}, "avg_character_length": {"dev": 1.0, "test": 1.0}, diff --git a/mteb/tasks/Image/ZeroShotClassification/eng/Birdsnap.py b/mteb/tasks/Image/ZeroShotClassification/eng/Birdsnap.py index d8f6ecc279..280c2f2ee5 100644 --- a/mteb/tasks/Image/ZeroShotClassification/eng/Birdsnap.py +++ b/mteb/tasks/Image/ZeroShotClassification/eng/Birdsnap.py @@ -31,14 +31,15 @@ class BirdsnapZeroShotClassification(AbsTaskZeroShotClassification): dialect=[], 
modalities=["image", "text"], sample_creation="created", - bibtex_citation="""@InProceedings{Berg_2014_CVPR, - author = {Berg, Thomas and Liu, Jiongxin and Woo Lee, Seung and Alexander, Michelle L. and Jacobs, David W. and Belhumeur, Peter N.}, - title = {Birdsnap: Large-scale Fine-grained Visual Categorization of Birds}, - booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, - month = {June}, - year = {2014} - } - """, + bibtex_citation=r""" +@inproceedings{Berg_2014_CVPR, + author = {Berg, Thomas and Liu, Jiongxin and Woo Lee, Seung and Alexander, Michelle L. and Jacobs, David W. and Belhumeur, Peter N.}, + booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + month = {June}, + title = {Birdsnap: Large-scale Fine-grained Visual Categorization of Birds}, + year = {2014}, +} +""", descriptive_stats={ "n_samples": {"test": 1851}, "avg_character_length": {"test": 431.4}, diff --git a/mteb/tasks/Image/ZeroShotClassification/eng/CIFAR.py b/mteb/tasks/Image/ZeroShotClassification/eng/CIFAR.py index cf3d349f22..56ce00d786 100644 --- a/mteb/tasks/Image/ZeroShotClassification/eng/CIFAR.py +++ b/mteb/tasks/Image/ZeroShotClassification/eng/CIFAR.py @@ -31,13 +31,14 @@ class CIFAR10ZeroShotClassification(AbsTaskZeroShotClassification): dialect=[], modalities=["text", "image"], sample_creation="created", - bibtex_citation=""" @TECHREPORT{Krizhevsky09learningmultiple, - author = {Alex Krizhevsky}, - title = {Learning multiple layers of features from tiny images}, - institution = {}, - year = {2009} - } - """, + bibtex_citation=r""" +@techreport{Krizhevsky09learningmultiple, + author = {Alex Krizhevsky}, + institution = {}, + title = {Learning multiple layers of features from tiny images}, + year = {2009}, +} +""", descriptive_stats={ "n_samples": {"test": 10000}, "avg_character_length": {"test": 431.4}, @@ -77,13 +78,14 @@ class 
CIFAR100ZeroShotClassification(AbsTaskZeroShotClassification): dialect=[], modalities=["text", "image"], sample_creation="created", - bibtex_citation=""" @TECHREPORT{Krizhevsky09learningmultiple, - author = {Alex Krizhevsky}, - title = {Learning multiple layers of features from tiny images}, - institution = {}, - year = {2009} - } - """, + bibtex_citation=r""" +@techreport{Krizhevsky09learningmultiple, + author = {Alex Krizhevsky}, + institution = {}, + title = {Learning multiple layers of features from tiny images}, + year = {2009}, +} +""", descriptive_stats={ "n_samples": {"test": 10000}, "avg_character_length": {"test": 431.4}, diff --git a/mteb/tasks/Image/ZeroShotClassification/eng/CLEVR.py b/mteb/tasks/Image/ZeroShotClassification/eng/CLEVR.py index 4dc8768443..a44c0141ca 100644 --- a/mteb/tasks/Image/ZeroShotClassification/eng/CLEVR.py +++ b/mteb/tasks/Image/ZeroShotClassification/eng/CLEVR.py @@ -28,14 +28,15 @@ class CLEVR(AbsTaskZeroShotClassification): dialect=[], modalities=["text", "image"], sample_creation="created", - bibtex_citation="""\ -@InProceedings{Johnson_2017_CVPR, -author = {Johnson, Justin and Hariharan, Bharath and van der Maaten, Laurens and Fei-Fei, Li and Lawrence Zitnick, C. and Girshick, Ross}, -title = {CLEVR: A Diagnostic Dataset for Compositional Language and Elementary Visual Reasoning}, -booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, -month = {July}, -year = {2017} -}""", + bibtex_citation=r""" +@inproceedings{Johnson_2017_CVPR, + author = {Johnson, Justin and Hariharan, Bharath and van der Maaten, Laurens and Fei-Fei, Li and Lawrence Zitnick, C. 
and Girshick, Ross}, + booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + month = {July}, + title = {CLEVR: A Diagnostic Dataset for Compositional Language and Elementary Visual Reasoning}, + year = {2017}, +} +""", descriptive_stats={ "n_samples": {"test": 15000}, "avg_character_length": {"test": 0}, @@ -80,14 +81,15 @@ class CLEVRCount(AbsTaskZeroShotClassification): dialect=[], modalities=["text", "image"], sample_creation="created", - bibtex_citation="""\ -@InProceedings{Johnson_2017_CVPR, -author = {Johnson, Justin and Hariharan, Bharath and van der Maaten, Laurens and Fei-Fei, Li and Lawrence Zitnick, C. and Girshick, Ross}, -title = {CLEVR: A Diagnostic Dataset for Compositional Language and Elementary Visual Reasoning}, -booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, -month = {July}, -year = {2017} -}""", + bibtex_citation=r""" +@inproceedings{Johnson_2017_CVPR, + author = {Johnson, Justin and Hariharan, Bharath and van der Maaten, Laurens and Fei-Fei, Li and Lawrence Zitnick, C. and Girshick, Ross}, + booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + month = {July}, + title = {CLEVR: A Diagnostic Dataset for Compositional Language and Elementary Visual Reasoning}, + year = {2017}, +} +""", descriptive_stats={ "n_samples": {"test": 15000}, "avg_character_length": {"test": 0}, diff --git a/mteb/tasks/Image/ZeroShotClassification/eng/Caltech101.py b/mteb/tasks/Image/ZeroShotClassification/eng/Caltech101.py index f75a03f822..6e897d0cdf 100644 --- a/mteb/tasks/Image/ZeroShotClassification/eng/Caltech101.py +++ b/mteb/tasks/Image/ZeroShotClassification/eng/Caltech101.py @@ -33,17 +33,19 @@ class Caltech101ZeroShotClassification(AbsTaskZeroShotClassification): dialect=[], modalities=["text", "image"], sample_creation="created", - bibtex_citation="""@INPROCEEDINGS{1384978, - author={Li Fei-Fei and Fergus, R. 
and Perona, P.}, - booktitle={2004 Conference on Computer Vision and Pattern Recognition Workshop}, - title={Learning Generative Visual Models from Few Training Examples: An Incremental Bayesian Approach Tested on 101 Object Categories}, - year={2004}, - volume={}, - number={}, - pages={178-178}, - keywords={Bayesian methods;Testing;Humans;Maximum likelihood estimation;Assembly;Shape;Machine vision;Image recognition;Parameter estimation;Image databases}, - doi={10.1109/CVPR.2004.383}} - """, + bibtex_citation=r""" +@inproceedings{1384978, + author = {Li Fei-Fei and Fergus, R. and Perona, P.}, + booktitle = {2004 Conference on Computer Vision and Pattern Recognition Workshop}, + doi = {10.1109/CVPR.2004.383}, + keywords = {Bayesian methods;Testing;Humans;Maximum likelihood estimation;Assembly;Shape;Machine vision;Image recognition;Parameter estimation;Image databases}, + number = {}, + pages = {178-178}, + title = {Learning Generative Visual Models from Few Training Examples: An Incremental Bayesian Approach Tested on 101 Object Categories}, + volume = {}, + year = {2004}, +} +""", descriptive_stats={ "n_samples": {"test": 6084}, "avg_character_length": {"test": 431.4}, diff --git a/mteb/tasks/Image/ZeroShotClassification/eng/Country211.py b/mteb/tasks/Image/ZeroShotClassification/eng/Country211.py index 19b81350a3..67bddc3c4c 100644 --- a/mteb/tasks/Image/ZeroShotClassification/eng/Country211.py +++ b/mteb/tasks/Image/ZeroShotClassification/eng/Country211.py @@ -31,14 +31,16 @@ class Country211ZeroShotClassification(AbsTaskZeroShotClassification): license="cc-by-sa-4.0", annotations_creators="derived", dialect=[], - modalities=["image"], + modalities=["image", "text"], sample_creation="created", - bibtex_citation="""@article{radford2021learning, - title={Learning Transferable Visual Models From Natural Language Supervision}, - author={Radford, Alec and Kim, Jong Wook and Hallacy, Chris and Ramesh, Aditya and Goh, Gabriel and Agarwal, Sandhini and Sastry, Girish and 
Askell, Amanda and Mishkin, Pamela and Clark, Jack and others}, - journal={arXiv preprint arXiv:2103.00020}, - year={2021} - }""", + bibtex_citation=r""" +@article{radford2021learning, + author = {Radford, Alec and Kim, Jong Wook and Hallacy, Chris and Ramesh, Aditya and Goh, Gabriel and Agarwal, Sandhini and Sastry, Girish and Askell, Amanda and Mishkin, Pamela and Clark, Jack and others}, + journal = {arXiv preprint arXiv:2103.00020}, + title = {Learning Transferable Visual Models From Natural Language Supervision}, + year = {2021}, +} +""", descriptive_stats={ "n_samples": {"test": 21100}, "avg_character_length": {"test": 0}, diff --git a/mteb/tasks/Image/ZeroShotClassification/eng/DTD.py b/mteb/tasks/Image/ZeroShotClassification/eng/DTD.py index 5ebaee93ab..5be1a00507 100644 --- a/mteb/tasks/Image/ZeroShotClassification/eng/DTD.py +++ b/mteb/tasks/Image/ZeroShotClassification/eng/DTD.py @@ -31,12 +31,14 @@ class DTDZeroShotClassification(AbsTaskZeroShotClassification): dialect=[], modalities=["image", "text"], sample_creation="created", - bibtex_citation="""@InProceedings{cimpoi14describing, - Author = {M. Cimpoi and S. Maji and I. Kokkinos and S. Mohamed and and A. Vedaldi}, - Title = {Describing Textures in the Wild}, - Booktitle = {Proceedings of the {IEEE} Conf. on Computer Vision and Pattern Recognition ({CVPR})}, - Year = {2014}} - """, + bibtex_citation=r""" +@inproceedings{cimpoi14describing, + author = {M. Cimpoi and S. Maji and I. Kokkinos and S. Mohamed and and A. Vedaldi}, + booktitle = {Proceedings of the {IEEE} Conf. 
on Computer Vision and Pattern Recognition ({CVPR})}, + title = {Describing Textures in the Wild}, + year = {2014}, +} +""", descriptive_stats={ "n_samples": {"test": 1880}, "avg_character_length": {"test": 456}, diff --git a/mteb/tasks/Image/ZeroShotClassification/eng/EuroSAT.py b/mteb/tasks/Image/ZeroShotClassification/eng/EuroSAT.py index e09ece2463..028ecde4ff 100644 --- a/mteb/tasks/Image/ZeroShotClassification/eng/EuroSAT.py +++ b/mteb/tasks/Image/ZeroShotClassification/eng/EuroSAT.py @@ -31,17 +31,19 @@ class EuroSATZeroShotClassification(AbsTaskZeroShotClassification): dialect=[], modalities=["image", "text"], sample_creation="created", - bibtex_citation="""@ARTICLE{8736785, - author={Helber, Patrick and Bischke, Benjamin and Dengel, Andreas and Borth, Damian}, - journal={IEEE Journal of Selected Topics in Applied Earth Observations and Remote Sensing}, - title={EuroSAT: A Novel Dataset and Deep Learning Benchmark for Land Use and Land Cover Classification}, - year={2019}, - volume={12}, - number={7}, - pages={2217-2226}, - keywords={Satellites;Earth;Remote sensing;Machine learning;Spatial resolution;Feature extraction;Benchmark testing;Dataset;deep convolutional neural network;deep learning;earth observation;land cover classification;land use classification;machine learning;remote sensing;satellite image classification;satellite images}, - doi={10.1109/JSTARS.2019.2918242}} - """, + bibtex_citation=r""" +@article{8736785, + author = {Helber, Patrick and Bischke, Benjamin and Dengel, Andreas and Borth, Damian}, + doi = {10.1109/JSTARS.2019.2918242}, + journal = {IEEE Journal of Selected Topics in Applied Earth Observations and Remote Sensing}, + keywords = {Satellites;Earth;Remote sensing;Machine learning;Spatial resolution;Feature extraction;Benchmark testing;Dataset;deep convolutional neural network;deep learning;earth observation;land cover classification;land use classification;machine learning;remote sensing;satellite image classification;satellite 
images}, + number = {7}, + pages = {2217-2226}, + title = {EuroSAT: A Novel Dataset and Deep Learning Benchmark for Land Use and Land Cover Classification}, + volume = {12}, + year = {2019}, +} +""", descriptive_stats={ "n_samples": {"test": 5400}, "avg_character_length": {"test": 431.4}, diff --git a/mteb/tasks/Image/ZeroShotClassification/eng/FER2013.py b/mteb/tasks/Image/ZeroShotClassification/eng/FER2013.py index abdd5c2d29..0dc00a62b7 100644 --- a/mteb/tasks/Image/ZeroShotClassification/eng/FER2013.py +++ b/mteb/tasks/Image/ZeroShotClassification/eng/FER2013.py @@ -31,16 +31,17 @@ class FER2013ZeroShotClassification(AbsTaskZeroShotClassification): dialect=[], modalities=["image", "text"], sample_creation="created", - bibtex_citation="""@misc{goodfellow2015explainingharnessingadversarialexamples, - title={Explaining and Harnessing Adversarial Examples}, - author={Ian J. Goodfellow and Jonathon Shlens and Christian Szegedy}, - year={2015}, - eprint={1412.6572}, - archivePrefix={arXiv}, - primaryClass={stat.ML}, - url={https://arxiv.org/abs/1412.6572}, - } - """, + bibtex_citation=r""" +@misc{goodfellow2015explainingharnessingadversarialexamples, + archiveprefix = {arXiv}, + author = {Ian J. 
Goodfellow and Jonathon Shlens and Christian Szegedy}, + eprint = {1412.6572}, + primaryclass = {stat.ML}, + title = {Explaining and Harnessing Adversarial Examples}, + url = {https://arxiv.org/abs/1412.6572}, + year = {2015}, +} +""", descriptive_stats={ "n_samples": {"test": 7178}, "avg_character_length": {"test": 431.4}, diff --git a/mteb/tasks/Image/ZeroShotClassification/eng/FGVCAircraft.py b/mteb/tasks/Image/ZeroShotClassification/eng/FGVCAircraft.py index c15600a25a..7e8ea8257c 100644 --- a/mteb/tasks/Image/ZeroShotClassification/eng/FGVCAircraft.py +++ b/mteb/tasks/Image/ZeroShotClassification/eng/FGVCAircraft.py @@ -32,16 +32,17 @@ class FGVCAircraftZeroShotClassification(AbsTaskZeroShotClassification): dialect=[], modalities=["text", "image"], sample_creation="created", - bibtex_citation="""@misc{maji2013finegrainedvisualclassificationaircraft, - title={Fine-Grained Visual Classification of Aircraft}, - author={Subhransu Maji and Esa Rahtu and Juho Kannala and Matthew Blaschko and Andrea Vedaldi}, - year={2013}, - eprint={1306.5151}, - archivePrefix={arXiv}, - primaryClass={cs.CV}, - url={https://arxiv.org/abs/1306.5151}, - } - """, + bibtex_citation=r""" +@misc{maji2013finegrainedvisualclassificationaircraft, + archiveprefix = {arXiv}, + author = {Subhransu Maji and Esa Rahtu and Juho Kannala and Matthew Blaschko and Andrea Vedaldi}, + eprint = {1306.5151}, + primaryclass = {cs.CV}, + title = {Fine-Grained Visual Classification of Aircraft}, + url = {https://arxiv.org/abs/1306.5151}, + year = {2013}, +} +""", descriptive_stats={ "n_samples": {"test": 3333}, "avg_character_length": {"test": 431.4}, diff --git a/mteb/tasks/Image/ZeroShotClassification/eng/Food101.py b/mteb/tasks/Image/ZeroShotClassification/eng/Food101.py index 0480e663bf..e53022f53c 100644 --- a/mteb/tasks/Image/ZeroShotClassification/eng/Food101.py +++ b/mteb/tasks/Image/ZeroShotClassification/eng/Food101.py @@ -31,13 +31,14 @@ class 
Food101ZeroShotClassification(AbsTaskZeroShotClassification): dialect=[], modalities=["text", "image"], sample_creation="created", - bibtex_citation=""" @inproceedings{bossard14, - title = {Food-101 -- Mining Discriminative Components with Random Forests}, - author = {Bossard, Lukas and Guillaumin, Matthieu and Van Gool, Luc}, - booktitle = {European Conference on Computer Vision}, - year = {2014} - } - """, + bibtex_citation=r""" +@inproceedings{bossard14, + author = {Bossard, Lukas and Guillaumin, Matthieu and Van Gool, Luc}, + booktitle = {European Conference on Computer Vision}, + title = {Food-101 -- Mining Discriminative Components with Random Forests}, + year = {2014}, +} +""", descriptive_stats={ "n_samples": {"validation": 25300}, "avg_character_length": {"validation": 431.4}, diff --git a/mteb/tasks/Image/ZeroShotClassification/eng/GTSRB.py b/mteb/tasks/Image/ZeroShotClassification/eng/GTSRB.py index e8dededb12..d68d8928b4 100644 --- a/mteb/tasks/Image/ZeroShotClassification/eng/GTSRB.py +++ b/mteb/tasks/Image/ZeroShotClassification/eng/GTSRB.py @@ -31,19 +31,21 @@ class GTSRBZeroShotClassification(AbsTaskZeroShotClassification): license="not specified", annotations_creators="derived", dialect=[], - modalities=["image"], + modalities=["image", "text"], sample_creation="created", - bibtex_citation="""@INPROCEEDINGS{6033395, - author={Stallkamp, Johannes and Schlipsing, Marc and Salmen, Jan and Igel, Christian}, - booktitle={The 2011 International Joint Conference on Neural Networks}, - title={The German Traffic Sign Recognition Benchmark: A multi-class classification competition}, - year={2011}, - volume={}, - number={}, - pages={1453-1460}, - keywords={Humans;Training;Image color analysis;Benchmark testing;Lead;Histograms;Image resolution}, - doi={10.1109/IJCNN.2011.6033395}} - """, + bibtex_citation=r""" +@inproceedings{6033395, + author = {Stallkamp, Johannes and Schlipsing, Marc and Salmen, Jan and Igel, Christian}, + booktitle = {The 2011 
International Joint Conference on Neural Networks}, + doi = {10.1109/IJCNN.2011.6033395}, + keywords = {Humans;Training;Image color analysis;Benchmark testing;Lead;Histograms;Image resolution}, + number = {}, + pages = {1453-1460}, + title = {The German Traffic Sign Recognition Benchmark: A multi-class classification competition}, + volume = {}, + year = {2011}, +} +""", descriptive_stats={ "n_samples": {"test": 12630}, "avg_character_length": {"test": 0}, diff --git a/mteb/tasks/Image/ZeroShotClassification/eng/Imagenet1k.py b/mteb/tasks/Image/ZeroShotClassification/eng/Imagenet1k.py index 80ce7163fb..179fb953ce 100644 --- a/mteb/tasks/Image/ZeroShotClassification/eng/Imagenet1k.py +++ b/mteb/tasks/Image/ZeroShotClassification/eng/Imagenet1k.py @@ -31,16 +31,18 @@ class Imagenet1kZeroShotClassification(AbsTaskZeroShotClassification): license="not specified", annotations_creators="human-annotated", dialect=[], - modalities=["image"], + modalities=["image", "text"], sample_creation="created", - bibtex_citation="""@article{deng2009imagenet, - title={ImageNet: A large-scale hierarchical image database}, - author={Deng, Jia and Dong, Wei and Socher, Richard and Li, Li-Jia and Li, Kai and Fei-Fei, Li}, - journal={2009 IEEE Conference on Computer Vision and Pattern Recognition}, - pages={248--255}, - year={2009}, - organization={Ieee} - }""", + bibtex_citation=r""" +@article{deng2009imagenet, + author = {Deng, Jia and Dong, Wei and Socher, Richard and Li, Li-Jia and Li, Kai and Fei-Fei, Li}, + journal = {2009 IEEE Conference on Computer Vision and Pattern Recognition}, + organization = {Ieee}, + pages = {248--255}, + title = {ImageNet: A large-scale hierarchical image database}, + year = {2009}, +} +""", descriptive_stats={ "n_samples": {"test": 37200}, "avg_character_length": {"test": 0}, diff --git a/mteb/tasks/Image/ZeroShotClassification/eng/MNIST.py b/mteb/tasks/Image/ZeroShotClassification/eng/MNIST.py index 4b4735f269..e8b14abbe8 100644 --- 
a/mteb/tasks/Image/ZeroShotClassification/eng/MNIST.py +++ b/mteb/tasks/Image/ZeroShotClassification/eng/MNIST.py @@ -31,14 +31,15 @@ class MNISTZeroShotClassification(AbsTaskZeroShotClassification): dialect=[], modalities=["image", "text"], sample_creation="created", - bibtex_citation="""@article{lecun2010mnist, - title={MNIST handwritten digit database}, - author={LeCun, Yann and Cortes, Corinna and Burges, CJ}, - journal={ATT Labs [Online]. Available: http://yann.lecun.com/exdb/mnist}, - volume={2}, - year={2010} - } - """, + bibtex_citation=r""" +@article{lecun2010mnist, + author = {LeCun, Yann and Cortes, Corinna and Burges, CJ}, + journal = {ATT Labs [Online]. Available: http://yann.lecun.com/exdb/mnist}, + title = {MNIST handwritten digit database}, + volume = {2}, + year = {2010}, +} +""", descriptive_stats={ "n_samples": {"test": 10000}, "avg_character_length": {"test": 431.4}, diff --git a/mteb/tasks/Image/ZeroShotClassification/eng/OxfordPets.py b/mteb/tasks/Image/ZeroShotClassification/eng/OxfordPets.py index af939e794c..1eef6bbd49 100644 --- a/mteb/tasks/Image/ZeroShotClassification/eng/OxfordPets.py +++ b/mteb/tasks/Image/ZeroShotClassification/eng/OxfordPets.py @@ -31,16 +31,17 @@ class OxfordPetsZeroShotClassification(AbsTaskZeroShotClassification): dialect=[], modalities=["text", "image"], sample_creation="created", - bibtex_citation="""@misc{maji2013finegrainedvisualclassificationaircraft, - title={Fine-Grained Visual Classification of Aircraft}, - author={Subhransu Maji and Esa Rahtu and Juho Kannala and Matthew Blaschko and Andrea Vedaldi}, - year={2013}, - eprint={1306.5151}, - archivePrefix={arXiv}, - primaryClass={cs.CV}, - url={https://arxiv.org/abs/1306.5151}, - } - """, + bibtex_citation=r""" +@misc{maji2013finegrainedvisualclassificationaircraft, + archiveprefix = {arXiv}, + author = {Subhransu Maji and Esa Rahtu and Juho Kannala and Matthew Blaschko and Andrea Vedaldi}, + eprint = {1306.5151}, + primaryclass = {cs.CV}, + title = 
{Fine-Grained Visual Classification of Aircraft}, + url = {https://arxiv.org/abs/1306.5151}, + year = {2013}, +} +""", descriptive_stats={ "n_samples": {"test": 3669}, "avg_character_length": {"test": 431.4}, diff --git a/mteb/tasks/Image/ZeroShotClassification/eng/PatchCamelyon.py b/mteb/tasks/Image/ZeroShotClassification/eng/PatchCamelyon.py index 11e155247b..50fe78866e 100644 --- a/mteb/tasks/Image/ZeroShotClassification/eng/PatchCamelyon.py +++ b/mteb/tasks/Image/ZeroShotClassification/eng/PatchCamelyon.py @@ -31,27 +31,28 @@ class PatchCamelyonZeroShotClassification(AbsTaskZeroShotClassification): license="not specified", annotations_creators="derived", dialect=[], - modalities=["image"], + modalities=["image", "text"], sample_creation="created", - bibtex_citation="""@InProceedings{10.1007/978-3-030-00934-2_24, -author="Veeling, Bastiaan S. + bibtex_citation=r""" +@inproceedings{10.1007/978-3-030-00934-2_24, + abstract = {We propose a new model for digital pathology segmentation, based on the observation that histopathology images are inherently symmetric under rotation and reflection. Utilizing recent findings on rotation equivariant CNNs, the proposed model leverages these symmetries in a principled manner. We present a visual analysis showing improved stability on predictions, and demonstrate that exploiting rotation equivariance significantly improves tumor detection performance on a challenging lymph node metastases dataset. We further present a novel derived dataset to enable principled comparison of machine learning models, in combination with an initial benchmark. Through this dataset, the task of histopathology diagnosis becomes accessible as a challenging benchmark for fundamental machine learning research.}, + address = {Cham}, + author = {Veeling, Bastiaan S. and Linmans, Jasper and Winkens, Jim and Cohen, Taco -and Welling, Max", -editor="Frangi, Alejandro F. 
+and Welling, Max}, + booktitle = {Medical Image Computing and Computer Assisted Intervention -- MICCAI 2018}, + editor = {Frangi, Alejandro F. and Schnabel, Julia A. and Davatzikos, Christos and Alberola-L{\'o}pez, Carlos -and Fichtinger, Gabor", -title="Rotation Equivariant CNNs for Digital Pathology", -booktitle="Medical Image Computing and Computer Assisted Intervention -- MICCAI 2018", -year="2018", -publisher="Springer International Publishing", -address="Cham", -pages="210--218", -abstract="We propose a new model for digital pathology segmentation, based on the observation that histopathology images are inherently symmetric under rotation and reflection. Utilizing recent findings on rotation equivariant CNNs, the proposed model leverages these symmetries in a principled manner. We present a visual analysis showing improved stability on predictions, and demonstrate that exploiting rotation equivariance significantly improves tumor detection performance on a challenging lymph node metastases dataset. We further present a novel derived dataset to enable principled comparison of machine learning models, in combination with an initial benchmark. 
Through this dataset, the task of histopathology diagnosis becomes accessible as a challenging benchmark for fundamental machine learning research.", -isbn="978-3-030-00934-2" +and Fichtinger, Gabor}, + isbn = {978-3-030-00934-2}, + pages = {210--218}, + publisher = {Springer International Publishing}, + title = {Rotation Equivariant CNNs for Digital Pathology}, + year = {2018}, } """, descriptive_stats={ diff --git a/mteb/tasks/Image/ZeroShotClassification/eng/RESISC45.py b/mteb/tasks/Image/ZeroShotClassification/eng/RESISC45.py index b4e2d4d20e..1b16853cd0 100644 --- a/mteb/tasks/Image/ZeroShotClassification/eng/RESISC45.py +++ b/mteb/tasks/Image/ZeroShotClassification/eng/RESISC45.py @@ -31,17 +31,19 @@ class RESISC45ZeroShotClassification(AbsTaskZeroShotClassification): dialect=[], modalities=["image", "text"], sample_creation="created", - bibtex_citation="""@ARTICLE{7891544, - author={Cheng, Gong and Han, Junwei and Lu, Xiaoqiang}, - journal={Proceedings of the IEEE}, - title={Remote Sensing Image Scene Classification: Benchmark and State of the Art}, - year={2017}, - volume={105}, - number={10}, - pages={1865-1883}, - keywords={Remote sensing;Benchmark testing;Spatial resolution;Social network services;Satellites;Image analysis;Machine learning;Unsupervised learning;Classification;Benchmark data set;deep learning;handcrafted features;remote sensing image;scene classification;unsupervised feature learning}, - doi={10.1109/JPROC.2017.2675998}} - """, + bibtex_citation=r""" +@article{7891544, + author = {Cheng, Gong and Han, Junwei and Lu, Xiaoqiang}, + doi = {10.1109/JPROC.2017.2675998}, + journal = {Proceedings of the IEEE}, + keywords = {Remote sensing;Benchmark testing;Spatial resolution;Social network services;Satellites;Image analysis;Machine learning;Unsupervised learning;Classification;Benchmark data set;deep learning;handcrafted features;remote sensing image;scene classification;unsupervised feature learning}, + number = {10}, + pages = {1865-1883}, + 
title = {Remote Sensing Image Scene Classification: Benchmark and State of the Art}, + volume = {105}, + year = {2017}, +} +""", descriptive_stats={ "n_samples": {"test": 6300}, "avg_character_length": {"test": 256}, diff --git a/mteb/tasks/Image/ZeroShotClassification/eng/RenderedSST2.py b/mteb/tasks/Image/ZeroShotClassification/eng/RenderedSST2.py index 01152f6d66..81fde5e1d3 100644 --- a/mteb/tasks/Image/ZeroShotClassification/eng/RenderedSST2.py +++ b/mteb/tasks/Image/ZeroShotClassification/eng/RenderedSST2.py @@ -28,7 +28,7 @@ class RenderedSST2(AbsTaskZeroShotClassification): dialect=[], modalities=["text", "image"], sample_creation="created", - bibtex_citation="""d""", + bibtex_citation="", descriptive_stats={ "n_samples": {"test": 1820}, "avg_character_length": {"test": 10.0}, diff --git a/mteb/tasks/Image/ZeroShotClassification/eng/STL10.py b/mteb/tasks/Image/ZeroShotClassification/eng/STL10.py index e5f41760a8..7bceacd8ed 100644 --- a/mteb/tasks/Image/ZeroShotClassification/eng/STL10.py +++ b/mteb/tasks/Image/ZeroShotClassification/eng/STL10.py @@ -31,22 +31,23 @@ class STL10ZeroShotClassification(AbsTaskZeroShotClassification): dialect=[], modalities=["image", "text"], sample_creation="created", - bibtex_citation="""@InProceedings{pmlr-v15-coates11a, - title = {An Analysis of Single-Layer Networks in Unsupervised Feature Learning}, - author = {Coates, Adam and Ng, Andrew and Lee, Honglak}, - booktitle = {Proceedings of the Fourteenth International Conference on Artificial Intelligence and Statistics}, - pages = {215--223}, - year = {2011}, - editor = {Gordon, Geoffrey and Dunson, David and Dudík, Miroslav}, - volume = {15}, - series = {Proceedings of Machine Learning Research}, - address = {Fort Lauderdale, FL, USA}, - month = {11--13 Apr}, - publisher = {PMLR}, - pdf = {http://proceedings.mlr.press/v15/coates11a/coates11a.pdf}, - url = {https://proceedings.mlr.press/v15/coates11a.html}, - } - """, + bibtex_citation=r""" 
+@inproceedings{pmlr-v15-coates11a, + address = {Fort Lauderdale, FL, USA}, + author = {Coates, Adam and Ng, Andrew and Lee, Honglak}, + booktitle = {Proceedings of the Fourteenth International Conference on Artificial Intelligence and Statistics}, + editor = {Gordon, Geoffrey and Dunson, David and Dudík, Miroslav}, + month = {11--13 Apr}, + pages = {215--223}, + pdf = {http://proceedings.mlr.press/v15/coates11a/coates11a.pdf}, + publisher = {PMLR}, + series = {Proceedings of Machine Learning Research}, + title = {An Analysis of Single-Layer Networks in Unsupervised Feature Learning}, + url = {https://proceedings.mlr.press/v15/coates11a.html}, + volume = {15}, + year = {2011}, +} +""", descriptive_stats={ "n_samples": {"test": 8000}, "avg_character_length": {"test": 431.4}, diff --git a/mteb/tasks/Image/ZeroShotClassification/eng/SUN397.py b/mteb/tasks/Image/ZeroShotClassification/eng/SUN397.py index 04f4caa71d..b307621503 100644 --- a/mteb/tasks/Image/ZeroShotClassification/eng/SUN397.py +++ b/mteb/tasks/Image/ZeroShotClassification/eng/SUN397.py @@ -31,16 +31,18 @@ class SUN397ZeroShotClassification(AbsTaskZeroShotClassification): dialect=[], modalities=["image", "text"], sample_creation="created", - bibtex_citation="""@INPROCEEDINGS{5539970, - author={Xiao, Jianxiong and Hays, James and Ehinger, Krista A. and Oliva, Aude and Torralba, Antonio}, - booktitle={2010 IEEE Computer Society Conference on Computer Vision and Pattern Recognition}, - title={SUN database: Large-scale scene recognition from abbey to zoo}, - year={2010}, - volume={}, - number={}, - pages={3485-3492}, - doi={10.1109/CVPR.2010.5539970}} - """, + bibtex_citation=r""" +@inproceedings{5539970, + author = {Xiao, Jianxiong and Hays, James and Ehinger, Krista A. 
and Oliva, Aude and Torralba, Antonio}, + booktitle = {2010 IEEE Computer Society Conference on Computer Vision and Pattern Recognition}, + doi = {10.1109/CVPR.2010.5539970}, + number = {}, + pages = {3485-3492}, + title = {SUN database: Large-scale scene recognition from abbey to zoo}, + volume = {}, + year = {2010}, +} +""", descriptive_stats={ "n_samples": {"test": 21750}, "avg_character_length": {"test": 256}, diff --git a/mteb/tasks/Image/ZeroShotClassification/eng/SciMMIR.py b/mteb/tasks/Image/ZeroShotClassification/eng/SciMMIR.py index 19b34bb174..51fc044236 100644 --- a/mteb/tasks/Image/ZeroShotClassification/eng/SciMMIR.py +++ b/mteb/tasks/Image/ZeroShotClassification/eng/SciMMIR.py @@ -28,16 +28,17 @@ class SciMMIR(AbsTaskZeroShotClassification): dialect=[], modalities=["text", "image"], sample_creation="created", - bibtex_citation="""\ + bibtex_citation=r""" @misc{wu2024scimmirbenchmarkingscientificmultimodal, - title={SciMMIR: Benchmarking Scientific Multi-modal Information Retrieval}, - author={Siwei Wu and Yizhi Li and Kang Zhu and Ge Zhang and Yiming Liang and Kaijing Ma and Chenghao Xiao and Haoran Zhang and Bohao Yang and Wenhu Chen and Wenhao Huang and Noura Al Moubayed and Jie Fu and Chenghua Lin}, - year={2024}, - eprint={2401.13478}, - archivePrefix={arXiv}, - primaryClass={cs.IR}, - url={https://arxiv.org/abs/2401.13478}, -}""", + archiveprefix = {arXiv}, + author = {Siwei Wu and Yizhi Li and Kang Zhu and Ge Zhang and Yiming Liang and Kaijing Ma and Chenghao Xiao and Haoran Zhang and Bohao Yang and Wenhu Chen and Wenhao Huang and Noura Al Moubayed and Jie Fu and Chenghua Lin}, + eprint = {2401.13478}, + primaryclass = {cs.IR}, + title = {SciMMIR: Benchmarking Scientific Multi-modal Information Retrieval}, + url = {https://arxiv.org/abs/2401.13478}, + year = {2024}, +} +""", descriptive_stats={ "n_samples": {"test": 16263}, "avg_character_length": {"test": 0}, diff --git a/mteb/tasks/Image/ZeroShotClassification/eng/StanfordCars.py 
b/mteb/tasks/Image/ZeroShotClassification/eng/StanfordCars.py index 66a380330e..134de6d80a 100644 --- a/mteb/tasks/Image/ZeroShotClassification/eng/StanfordCars.py +++ b/mteb/tasks/Image/ZeroShotClassification/eng/StanfordCars.py @@ -31,12 +31,14 @@ class StanfordCarsZeroShotClassification(AbsTaskZeroShotClassification): dialect=[], modalities=["image", "text"], sample_creation="created", - bibtex_citation="""@inproceedings{Krause2013CollectingAL, - title={Collecting a Large-scale Dataset of Fine-grained Cars}, - author={Jonathan Krause and Jia Deng and Michael Stark and Li Fei-Fei}, - year={2013}, - url={https://api.semanticscholar.org/CorpusID:16632981} - }""", + bibtex_citation=r""" +@inproceedings{Krause2013CollectingAL, + author = {Jonathan Krause and Jia Deng and Michael Stark and Li Fei-Fei}, + title = {Collecting a Large-scale Dataset of Fine-grained Cars}, + url = {https://api.semanticscholar.org/CorpusID:16632981}, + year = {2013}, +} +""", descriptive_stats={ "n_samples": {"test": 8041}, "avg_character_length": {"test": 431.4}, diff --git a/mteb/tasks/Image/ZeroShotClassification/eng/UCF101.py b/mteb/tasks/Image/ZeroShotClassification/eng/UCF101.py index ce8f2fc674..dd874d9356 100644 --- a/mteb/tasks/Image/ZeroShotClassification/eng/UCF101.py +++ b/mteb/tasks/Image/ZeroShotClassification/eng/UCF101.py @@ -33,17 +33,19 @@ class UCF101ZeroShotClassification(AbsTaskZeroShotClassification): license="not specified", annotations_creators="derived", dialect=[], - modalities=["image"], + modalities=["image", "text"], sample_creation="created", - bibtex_citation="""@misc{soomro2012ucf101dataset101human, - title={UCF101: A Dataset of 101 Human Actions Classes From Videos in The Wild}, - author={Khurram Soomro and Amir Roshan Zamir and Mubarak Shah}, - year={2012}, - eprint={1212.0402}, - archivePrefix={arXiv}, - primaryClass={cs.CV}, - url={https://arxiv.org/abs/1212.0402}, -}""", + bibtex_citation=r""" +@misc{soomro2012ucf101dataset101human, + archiveprefix = 
{arXiv}, + author = {Khurram Soomro and Amir Roshan Zamir and Mubarak Shah}, + eprint = {1212.0402}, + primaryclass = {cs.CV}, + title = {UCF101: A Dataset of 101 Human Actions Classes From Videos in The Wild}, + url = {https://arxiv.org/abs/1212.0402}, + year = {2012}, +} +""", descriptive_stats={ "n_samples": {"test": 697222}, "avg_character_length": {"test": 0}, diff --git a/mteb/tasks/Image/__init__.py b/mteb/tasks/Image/__init__.py index 5d7970e912..140c75d652 100644 --- a/mteb/tasks/Image/__init__.py +++ b/mteb/tasks/Image/__init__.py @@ -2,8 +2,8 @@ from .Any2AnyMultiChoice import * from .Any2AnyRetrieval import * -from .Clustering import * from .ImageClassification import * +from .ImageClustering import * from .ImageMultilabelClassification import * from .ImageTextPairClassification import * from .VisualSTS import * diff --git a/mteb/tasks/InstructionRetrieval/eng/Core17InstructionRetrieval.py b/mteb/tasks/InstructionRetrieval/eng/Core17InstructionRetrieval.py index dc9614a28b..6717d5f80b 100644 --- a/mteb/tasks/InstructionRetrieval/eng/Core17InstructionRetrieval.py +++ b/mteb/tasks/InstructionRetrieval/eng/Core17InstructionRetrieval.py @@ -27,12 +27,14 @@ class Core17InstructionRetrieval(AbsTaskInstructionRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@misc{weller2024followir, - title={FollowIR: Evaluating and Teaching Information Retrieval Models to Follow Instructions}, - author={Orion Weller and Benjamin Chang and Sean MacAvaney and Kyle Lo and Arman Cohan and Benjamin Van Durme and Dawn Lawrie and Luca Soldaini}, - year={2024}, - eprint={2403.15246}, - archivePrefix={arXiv}, - primaryClass={cs.IR} -}""", + bibtex_citation=r""" +@misc{weller2024followir, + archiveprefix = {arXiv}, + author = {Orion Weller and Benjamin Chang and Sean MacAvaney and Kyle Lo and Arman Cohan and Benjamin Van Durme and Dawn Lawrie and Luca Soldaini}, + eprint = {2403.15246}, + primaryclass = {cs.IR}, + title = {FollowIR: 
Evaluating and Teaching Information Retrieval Models to Follow Instructions}, + year = {2024}, +} +""", ) diff --git a/mteb/tasks/InstructionRetrieval/eng/News21InstructionRetrieval.py b/mteb/tasks/InstructionRetrieval/eng/News21InstructionRetrieval.py index e20833128d..3c4c628589 100644 --- a/mteb/tasks/InstructionRetrieval/eng/News21InstructionRetrieval.py +++ b/mteb/tasks/InstructionRetrieval/eng/News21InstructionRetrieval.py @@ -27,12 +27,14 @@ class News21InstructionRetrieval(AbsTaskInstructionRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@misc{weller2024followir, - title={FollowIR: Evaluating and Teaching Information Retrieval Models to Follow Instructions}, - author={Orion Weller and Benjamin Chang and Sean MacAvaney and Kyle Lo and Arman Cohan and Benjamin Van Durme and Dawn Lawrie and Luca Soldaini}, - year={2024}, - eprint={2403.15246}, - archivePrefix={arXiv}, - primaryClass={cs.IR} -}""", + bibtex_citation=r""" +@misc{weller2024followir, + archiveprefix = {arXiv}, + author = {Orion Weller and Benjamin Chang and Sean MacAvaney and Kyle Lo and Arman Cohan and Benjamin Van Durme and Dawn Lawrie and Luca Soldaini}, + eprint = {2403.15246}, + primaryclass = {cs.IR}, + title = {FollowIR: Evaluating and Teaching Information Retrieval Models to Follow Instructions}, + year = {2024}, +} +""", ) diff --git a/mteb/tasks/InstructionRetrieval/eng/Robust04InstructionRetrieval.py b/mteb/tasks/InstructionRetrieval/eng/Robust04InstructionRetrieval.py index 6056624309..f740d37ed9 100644 --- a/mteb/tasks/InstructionRetrieval/eng/Robust04InstructionRetrieval.py +++ b/mteb/tasks/InstructionRetrieval/eng/Robust04InstructionRetrieval.py @@ -27,12 +27,14 @@ class Robust04InstructionRetrieval(AbsTaskInstructionRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@misc{weller2024followir, - title={FollowIR: Evaluating and Teaching Information Retrieval Models to Follow 
Instructions}, - author={Orion Weller and Benjamin Chang and Sean MacAvaney and Kyle Lo and Arman Cohan and Benjamin Van Durme and Dawn Lawrie and Luca Soldaini}, - year={2024}, - eprint={2403.15246}, - archivePrefix={arXiv}, - primaryClass={cs.IR} -}""", + bibtex_citation=r""" +@misc{weller2024followir, + archiveprefix = {arXiv}, + author = {Orion Weller and Benjamin Chang and Sean MacAvaney and Kyle Lo and Arman Cohan and Benjamin Van Durme and Dawn Lawrie and Luca Soldaini}, + eprint = {2403.15246}, + primaryclass = {cs.IR}, + title = {FollowIR: Evaluating and Teaching Information Retrieval Models to Follow Instructions}, + year = {2024}, +} +""", ) diff --git a/mteb/tasks/InstructionRetrieval/multilingual/mFollowIR.py b/mteb/tasks/InstructionRetrieval/multilingual/mFollowIR.py index 9452beb8de..b738c5067b 100644 --- a/mteb/tasks/InstructionRetrieval/multilingual/mFollowIR.py +++ b/mteb/tasks/InstructionRetrieval/multilingual/mFollowIR.py @@ -194,12 +194,14 @@ class mFollowIRCrossLingual(MultilingualTask, AbsTaskInstructionRetrieval): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@article{weller2024mfollowir, - title={{mFollowIR: a Multilingual Benchmark for Instruction Following in Retrieval}}, - author={Weller, Orion and Chang, Benjamin and Yang, Eugene and Yarmohammadi, Mahsa and Barham, Sam and MacAvaney, Sean and Cohan, Arman and Soldaini, Luca and Van Durme, Benjamin and Lawrie, Dawn}, - journal={arXiv preprint TODO}, - year={2024} -}""", + bibtex_citation=r""" +@article{weller2024mfollowir, + author = {Weller, Orion and Chang, Benjamin and Yang, Eugene and Yarmohammadi, Mahsa and Barham, Sam and MacAvaney, Sean and Cohan, Arman and Soldaini, Luca and Van Durme, Benjamin and Lawrie, Dawn}, + journal = {arXiv preprint TODO}, + title = {{mFollowIR: a Multilingual Benchmark for Instruction Following in Retrieval}}, + year = {2024}, +} +""", ) def load_data(self, **kwargs): @@ -247,12 +249,14 @@ class 
mFollowIR(MultilingualTask, AbsTaskInstructionRetrieval): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@article{weller2024mfollowir, - title={{mFollowIR: a Multilingual Benchmark for Instruction Following in Retrieval}}, - author={Weller, Orion and Chang, Benjamin and Yang, Eugene and Yarmohammadi, Mahsa and Barham, Sam and MacAvaney, Sean and Cohan, Arman and Soldaini, Luca and Van Durme, Benjamin and Lawrie, Dawn}, - journal={arXiv preprint TODO}, - year={2024} -}""", + bibtex_citation=r""" +@article{weller2024mfollowir, + author = {Weller, Orion and Chang, Benjamin and Yang, Eugene and Yarmohammadi, Mahsa and Barham, Sam and MacAvaney, Sean and Cohan, Arman and Soldaini, Luca and Van Durme, Benjamin and Lawrie, Dawn}, + journal = {arXiv preprint TODO}, + title = {{mFollowIR: a Multilingual Benchmark for Instruction Following in Retrieval}}, + year = {2024}, +} +""", ) def load_data(self, **kwargs): diff --git a/mteb/tasks/MultiLabelClassification/__init__.py b/mteb/tasks/MultiLabelClassification/__init__.py index 0cf8c1bf6a..6b8ab6b7d4 100644 --- a/mteb/tasks/MultiLabelClassification/__init__.py +++ b/mteb/tasks/MultiLabelClassification/__init__.py @@ -5,4 +5,5 @@ from .multilingual.MultiEURLEXMultilabelClassification import * from .por.BrazilianToxicTweetsClassification import * from .rus.CEDRClassification import * +from .rus.ru_toixic_multilabelclassification_okmlcup import * from .rus.SensitiveTopicsClassification import * diff --git a/mteb/tasks/MultiLabelClassification/kor/KorHateSpeechMLClassification.py b/mteb/tasks/MultiLabelClassification/kor/KorHateSpeechMLClassification.py index 42b5a40a45..8104875705 100644 --- a/mteb/tasks/MultiLabelClassification/kor/KorHateSpeechMLClassification.py +++ b/mteb/tasks/MultiLabelClassification/kor/KorHateSpeechMLClassification.py @@ -37,24 +37,25 @@ class KorHateSpeechMLClassification(AbsTaskMultilabelClassification): annotations_creators="expert-annotated", 
dialect=[], sample_creation="found", - bibtex_citation=""" - @inproceedings{lee-etal-2022-k, - title = "K-{MH}a{S}: A Multi-label Hate Speech Detection Dataset in {K}orean Online News Comment", - author = "Lee, Jean and - Lim, Taejun and - Lee, Heejun and - Jo, Bogeun and - Kim, Yangsok and - Yoon, Heegeun and - Han, Soyeon Caren", - booktitle = "Proceedings of the 29th International Conference on Computational Linguistics", - month = oct, - year = "2022", - address = "Gyeongju, Republic of Korea", - publisher = "International Committee on Computational Linguistics", - url = "https://aclanthology.org/2022.coling-1.311", - pages = "3530--3538", - }""", + bibtex_citation=r""" +@inproceedings{lee-etal-2022-k, + address = {Gyeongju, Republic of Korea}, + author = {Lee, Jean and +Lim, Taejun and +Lee, Heejun and +Jo, Bogeun and +Kim, Yangsok and +Yoon, Heegeun and +Han, Soyeon Caren}, + booktitle = {Proceedings of the 29th International Conference on Computational Linguistics}, + month = oct, + pages = {3530--3538}, + publisher = {International Committee on Computational Linguistics}, + title = {K-{MH}a{S}: A Multi-label Hate Speech Detection Dataset in {K}orean Online News Comment}, + url = {https://aclanthology.org/2022.coling-1.311}, + year = {2022}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/MultiLabelClassification/mlt/MalteseNewsClassification.py b/mteb/tasks/MultiLabelClassification/mlt/MalteseNewsClassification.py index 528d18f396..196396bc83 100644 --- a/mteb/tasks/MultiLabelClassification/mlt/MalteseNewsClassification.py +++ b/mteb/tasks/MultiLabelClassification/mlt/MalteseNewsClassification.py @@ -33,16 +33,18 @@ class MalteseNewsClassification(AbsTaskMultilabelClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{maltese-news-datasets, - title = "Topic Classification and Headline Generation for {M}altese using a Public News Corpus", - author = "Chaudhary, Amit 
Kumar and - Micallef, Kurt and - Borg, Claudia", - booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation", - month = may, - year = "2024", - publisher = "Association for Computational Linguistics", - }""", + bibtex_citation=r""" +@inproceedings{maltese-news-datasets, + author = {Chaudhary, Amit Kumar and +Micallef, Kurt and +Borg, Claudia}, + booktitle = {Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation}, + month = may, + publisher = {Association for Computational Linguistics}, + title = {Topic Classification and Headline Generation for {M}altese using a Public News Corpus}, + year = {2024}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/MultiLabelClassification/multilingual/MultiEURLEXMultilabelClassification.py b/mteb/tasks/MultiLabelClassification/multilingual/MultiEURLEXMultilabelClassification.py index d4dadd1d63..9b7787d185 100644 --- a/mteb/tasks/MultiLabelClassification/multilingual/MultiEURLEXMultilabelClassification.py +++ b/mteb/tasks/MultiLabelClassification/multilingual/MultiEURLEXMultilabelClassification.py @@ -55,19 +55,19 @@ class MultiEURLEXMultilabelClassification( annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" -@InProceedings{chalkidis-etal-2021-multieurlex, + bibtex_citation=r""" +@inproceedings{chalkidis-etal-2021-multieurlex, author = {Chalkidis, Ilias - and Fergadiotis, Manos - and Androutsopoulos, Ion}, - title = {MultiEURLEX -- A multi-lingual and multi-label legal document - classification dataset for zero-shot cross-lingual transfer}, +and Fergadiotis, Manos +and Androutsopoulos, Ion}, booktitle = {Proceedings of the 2021 Conference on Empirical Methods - in Natural Language Processing}, - year = {2021}, - publisher = {Association for Computational Linguistics}, +in Natural Language Processing}, location = {Punta Cana, 
Dominican Republic}, - url = {https://arxiv.org/abs/2109.00904} + publisher = {Association for Computational Linguistics}, + title = {MultiEURLEX -- A multi-lingual and multi-label legal document +classification dataset for zero-shot cross-lingual transfer}, + url = {https://arxiv.org/abs/2109.00904}, + year = {2021}, } - """, +""", ) diff --git a/mteb/tasks/MultiLabelClassification/por/BrazilianToxicTweetsClassification.py b/mteb/tasks/MultiLabelClassification/por/BrazilianToxicTweetsClassification.py index f56fa78d06..d2430d927b 100644 --- a/mteb/tasks/MultiLabelClassification/por/BrazilianToxicTweetsClassification.py +++ b/mteb/tasks/MultiLabelClassification/por/BrazilianToxicTweetsClassification.py @@ -33,21 +33,23 @@ class BrazilianToxicTweetsClassification(AbsTaskMultilabelClassification): annotations_creators="expert-annotated", dialect=["brazilian"], sample_creation="found", - bibtex_citation="""@article{DBLP:journals/corr/abs-2010-04543, - author = {Joao Augusto Leite and - Diego F. Silva and - Kalina Bontcheva and - Carolina Scarton}, - title = {Toxic Language Detection in Social Media for Brazilian Portuguese: - New Dataset and Multilingual Analysis}, - journal = {CoRR}, - volume = {abs/2010.04543}, - year = {2020}, - url = {https://arxiv.org/abs/2010.04543}, - eprinttype = {arXiv}, - eprint = {2010.04543}, - timestamp = {Tue, 15 Dec 2020 16:10:16 +0100}, - }""", + bibtex_citation=r""" +@article{DBLP:journals/corr/abs-2010-04543, + author = {Joao Augusto Leite and +Diego F. 
Silva and +Kalina Bontcheva and +Carolina Scarton}, + eprint = {2010.04543}, + eprinttype = {arXiv}, + journal = {CoRR}, + timestamp = {Tue, 15 Dec 2020 16:10:16 +0100}, + title = {Toxic Language Detection in Social Media for Brazilian Portuguese: +New Dataset and Multilingual Analysis}, + url = {https://arxiv.org/abs/2010.04543}, + volume = {abs/2010.04543}, + year = {2020}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/MultiLabelClassification/rus/CEDRClassification.py b/mteb/tasks/MultiLabelClassification/rus/CEDRClassification.py index 87795138d4..400dc0b9ae 100644 --- a/mteb/tasks/MultiLabelClassification/rus/CEDRClassification.py +++ b/mteb/tasks/MultiLabelClassification/rus/CEDRClassification.py @@ -28,15 +28,16 @@ class CEDRClassification(AbsTaskMultilabelClassification): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@article{sboev2021data, - title={Data-Driven Model for Emotion Detection in Russian Texts}, - author={Sboev, Alexander and Naumov, Aleksandr and Rybka, Roman}, - journal={Procedia Computer Science}, - volume={190}, - pages={637--642}, - year={2021}, - publisher={Elsevier} - } - """, + bibtex_citation=r""" +@article{sboev2021data, + author = {Sboev, Alexander and Naumov, Aleksandr and Rybka, Roman}, + journal = {Procedia Computer Science}, + pages = {637--642}, + publisher = {Elsevier}, + title = {Data-Driven Model for Emotion Detection in Russian Texts}, + volume = {190}, + year = {2021}, +} +""", prompt="Given a comment as query, find expressed emotions (joy, sadness, surprise, fear, and anger)", ) diff --git a/mteb/tasks/MultiLabelClassification/rus/SensitiveTopicsClassification.py b/mteb/tasks/MultiLabelClassification/rus/SensitiveTopicsClassification.py index fc199313d6..d2bb2fea9f 100644 --- a/mteb/tasks/MultiLabelClassification/rus/SensitiveTopicsClassification.py +++ b/mteb/tasks/MultiLabelClassification/rus/SensitiveTopicsClassification.py @@ -28,33 +28,35 @@ class 
SensitiveTopicsClassification(AbsTaskMultilabelClassification): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{babakov-etal-2021-detecting, - title = "Detecting Inappropriate Messages on Sensitive Topics that Could Harm a Company{'}s Reputation", - author = "Babakov, Nikolay and - Logacheva, Varvara and - Kozlova, Olga and - Semenov, Nikita and - Panchenko, Alexander", - editor = "Babych, Bogdan and - Kanishcheva, Olga and - Nakov, Preslav and - Piskorski, Jakub and - Pivovarova, Lidia and - Starko, Vasyl and - Steinberger, Josef and - Yangarber, Roman and - Marci{\'n}czuk, Micha{\l} and - Pollak, Senja and - P{\v{r}}ib{\'a}{\v{n}}, Pavel and - Robnik-{\v{S}}ikonja, Marko", - booktitle = "Proceedings of the 8th Workshop on Balto-Slavic Natural Language Processing", - month = apr, - year = "2021", - address = "Kiyv, Ukraine", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/2021.bsnlp-1.4", - pages = "26--36", - abstract = "Not all topics are equally {``}flammable{''} in terms of toxicity: a calm discussion of turtles or fishing less often fuels inappropriate toxic dialogues than a discussion of politics or sexual minorities. We define a set of sensitive topics that can yield inappropriate and toxic messages and describe the methodology of collecting and labelling a dataset for appropriateness. While toxicity in user-generated data is well-studied, we aim at defining a more fine-grained notion of inappropriateness. The core of inappropriateness is that it can harm the reputation of a speaker. This is different from toxicity in two respects: (i) inappropriateness is topic-related, and (ii) inappropriate message is not toxic but still unacceptable. We collect and release two datasets for Russian: a topic-labelled dataset and an appropriateness-labelled dataset. 
We also release pre-trained classification models trained on this data.", - }""", + bibtex_citation=r""" +@inproceedings{babakov-etal-2021-detecting, + abstract = {Not all topics are equally {``}flammable{''} in terms of toxicity: a calm discussion of turtles or fishing less often fuels inappropriate toxic dialogues than a discussion of politics or sexual minorities. We define a set of sensitive topics that can yield inappropriate and toxic messages and describe the methodology of collecting and labelling a dataset for appropriateness. While toxicity in user-generated data is well-studied, we aim at defining a more fine-grained notion of inappropriateness. The core of inappropriateness is that it can harm the reputation of a speaker. This is different from toxicity in two respects: (i) inappropriateness is topic-related, and (ii) inappropriate message is not toxic but still unacceptable. We collect and release two datasets for Russian: a topic-labelled dataset and an appropriateness-labelled dataset. 
We also release pre-trained classification models trained on this data.}, + address = {Kiyv, Ukraine}, + author = {Babakov, Nikolay and +Logacheva, Varvara and +Kozlova, Olga and +Semenov, Nikita and +Panchenko, Alexander}, + booktitle = {Proceedings of the 8th Workshop on Balto-Slavic Natural Language Processing}, + editor = {Babych, Bogdan and +Kanishcheva, Olga and +Nakov, Preslav and +Piskorski, Jakub and +Pivovarova, Lidia and +Starko, Vasyl and +Steinberger, Josef and +Yangarber, Roman and +Marci{\'n}czuk, Micha{\l} and +Pollak, Senja and +P{\v{r}}ib{\'a}{\v{n}}, Pavel and +Robnik-{\v{S}}ikonja, Marko}, + month = apr, + pages = {26--36}, + publisher = {Association for Computational Linguistics}, + title = {Detecting Inappropriate Messages on Sensitive Topics that Could Harm a Company{'}s Reputation}, + url = {https://aclanthology.org/2021.bsnlp-1.4}, + year = {2021}, +} +""", prompt="Given a sentence as query, find sensitive topics", ) diff --git a/mteb/tasks/MultiLabelClassification/rus/ru_toixic_multilabelclassification_okmlcup.py b/mteb/tasks/MultiLabelClassification/rus/ru_toixic_multilabelclassification_okmlcup.py new file mode 100644 index 0000000000..e7a0b8a158 --- /dev/null +++ b/mteb/tasks/MultiLabelClassification/rus/ru_toixic_multilabelclassification_okmlcup.py @@ -0,0 +1,35 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskMultilabelClassification import ( + AbsTaskMultilabelClassification, +) +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class RuToxicOKMLCUPMultilabelClassification(AbsTaskMultilabelClassification): + metadata = TaskMetadata( + name="RuToxicOKMLCUPMultilabelClassification", + dataset={ + "path": "mteb/RuToxicOKMLCUPClassification", + "revision": "13722b7320ef4b6a471f9e8b379f3f49167d0517", + }, + description="On the Odnoklassniki social network, users post a huge number of comments of various directions and nature every day.", + reference="https://cups.online/ru/contests/okmlcup2020", + 
type="Classification", + category="t2t", + modalities=["text"], + eval_splits=["test"], + eval_langs=["rus-Cyrl"], + main_score="accuracy", + date=("2015-01-01", "2024-01-01"), + domains=[], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation="""""", + ) + + def dataset_transform(self): + self.dataset = self.dataset.rename_column("labels", "label") diff --git a/mteb/tasks/PairClassification/__init__.py b/mteb/tasks/PairClassification/__init__.py index 6cd75ea144..f562879bd8 100644 --- a/mteb/tasks/PairClassification/__init__.py +++ b/mteb/tasks/PairClassification/__init__.py @@ -2,6 +2,7 @@ from .ara.ArEntail import * from .ces.CTKFactsNLI import * +from .dan.TalemaaderPC import * from .deu.FalseFriendsDeEnPC import * from .eng.LegalBenchPC import * from .eng.PubChemAISentenceParaphrasePC import * diff --git a/mteb/tasks/PairClassification/ara/ArEntail.py b/mteb/tasks/PairClassification/ara/ArEntail.py index 9afce29d71..a427f2ba30 100644 --- a/mteb/tasks/PairClassification/ara/ArEntail.py +++ b/mteb/tasks/PairClassification/ara/ArEntail.py @@ -29,14 +29,16 @@ class ArEntail(AbsTaskPairClassification): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@article{obeidat2024arentail, - title={ArEntail: manually-curated Arabic natural language inference dataset from news headlines}, - author={Obeidat, Rasha and Al-Harahsheh, Yara and Al-Ayyoub, Mahmoud and Gharaibeh, Maram}, - journal={Language Resources and Evaluation}, - pages={1--27}, - year={2024}, - publisher={Springer} - }""", + bibtex_citation=r""" +@article{obeidat2024arentail, + author = {Obeidat, Rasha and Al-Harahsheh, Yara and Al-Ayyoub, Mahmoud and Gharaibeh, Maram}, + journal = {Language Resources and Evaluation}, + pages = {1--27}, + publisher = {Springer}, + title = {ArEntail: manually-curated Arabic natural language inference dataset from news 
headlines}, + year = {2024}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/PairClassification/ces/CTKFactsNLI.py b/mteb/tasks/PairClassification/ces/CTKFactsNLI.py index 0083f2c8c7..cb51d912f3 100644 --- a/mteb/tasks/PairClassification/ces/CTKFactsNLI.py +++ b/mteb/tasks/PairClassification/ces/CTKFactsNLI.py @@ -27,16 +27,18 @@ class CTKFactsNLI(AbsTaskPairClassification): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@article{ullrich2023csfever, - title={CsFEVER and CTKFacts: acquiring Czech data for fact verification}, - author={Ullrich, Herbert and Drchal, Jan and R{\\`y}par, Martin and Vincourov{\\'a}, Hana and Moravec, V{\\'a}clav}, - journal={Language Resources and Evaluation}, - volume={57}, - number={4}, - pages={1571--1605}, - year={2023}, - publisher={Springer} - }""", # after removing label 1=NOT ENOUGH INFO + bibtex_citation=r""" +@article{ullrich2023csfever, + author = {Ullrich, Herbert and Drchal, Jan and R{\\`y}par, Martin and Vincourov{\\'a}, Hana and Moravec, V{\\'a}clav}, + journal = {Language Resources and Evaluation}, + number = {4}, + pages = {1571--1605}, + publisher = {Springer}, + title = {CsFEVER and CTKFacts: acquiring Czech data for fact verification}, + volume = {57}, + year = {2023}, +} +""", # after removing label 1=NOT ENOUGH INFO ) def dataset_transform(self): diff --git a/mteb/tasks/PairClassification/dan/TalemaaderPC.py b/mteb/tasks/PairClassification/dan/TalemaaderPC.py new file mode 100644 index 0000000000..8b9c5d2430 --- /dev/null +++ b/mteb/tasks/PairClassification/dan/TalemaaderPC.py @@ -0,0 +1,59 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class TalemaaderPC(AbsTaskPairClassification): + metadata = TaskMetadata( + name="TalemaaderPC", + description="""\ +The Danish Language and Literature Society has developed a dataset for 
evaluating language models in Danish. +The dataset contains a total of 1000 Danish idioms and fixed expressions with transferred meanings based on the Danish Dictionary's collection of fixed expressions with associated definitions. +For each of the 1000 idioms and fixed expressions, three false definitions have also been prepared. +The dataset can be used to test the performance of language models in identifying correct definitions for Danish idioms and fixed expressions. +""", + reference="https://sprogteknologi.dk/dataset/1000-talemader-evalueringsdatasaet", + dataset={ + "path": "mteb/talemaader_pc", + "revision": "e714d53c059ca83d56c41d22f800da8245bb87fc", + }, + type="PairClassification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["dan-Latn"], + main_score="max_accuracy", + date=("2024-11-20", "2024-11-20"), + domains=["Academic", "Written"], + task_subtypes=[], + license="cc-by-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="created", + bibtex_citation=r""" +@misc{DSLDK1000Talemader, + author = {{Det Danske Sprog- og Litteraturselskab}}, + howpublished = {Sprogteknologi.dk}, + language = {Danish}, + note = {CC-BY licensed dataset of 1000 Danish sayings and expressions}, + publisher = {Digitaliseringsstyrelsen \& Det Danske Sprog- og Litteraturselskab}, + title = {1000 danske talemåder - evalueringsdatasæt}, + url = {https://sprogteknologi.dk/dataset/1000-talemader-evalueringsdatasaet}, + year = {2024}, +} +""", + ) + + def dataset_transform(self): + _dataset = {} + for split in self.metadata.eval_splits: + hf_dataset = self.dataset[split] + _dataset[split] = [ + { + "sentence1": hf_dataset["sentence1"], + "sentence2": hf_dataset["sentence2"], + "labels": hf_dataset["label"], + } + ] + self.dataset = _dataset diff --git a/mteb/tasks/PairClassification/deu/FalseFriendsDeEnPC.py b/mteb/tasks/PairClassification/deu/FalseFriendsDeEnPC.py index bd1edf2f0e..62c163ba0e 100644 --- 
a/mteb/tasks/PairClassification/deu/FalseFriendsDeEnPC.py +++ b/mteb/tasks/PairClassification/deu/FalseFriendsDeEnPC.py @@ -27,15 +27,15 @@ class FalseFriendsDeEnPC(AbsTaskPairClassification): annotations_creators="human-annotated", dialect=[], sample_creation="created", - bibtex_citation=""" - @misc{Chibb_2022, - title="German-English False Friends in Multilingual Transformer Models: An Evaluation on Robustness and Word-to-Word Fine-Tuning", - author="Chibb, Aaron", - year="2022", - month="Sep" - abstract="This paper explores the robustness of multilingual language models against false friends. False friends are words that sound or are written the same in two different languages but have different meaning. Generally, it is argued that multilingual models, such as XLM-RoBERTA, can outperform monolingual models in most tasks on conventional datasets. However, false friends are not considered in these tests. In this paper, experiments with a false friends dataset show that multilingual models are not robust against false friends; they have problems creating monolingual representations and differentiating between meanings of similarly written words in different languages. An attempt of word-based finetuning multilingual models on false friends pairs is promising, however the results do not generally solve the presented problem and still, monolingual models are more robust against false friends." - } - """, + bibtex_citation=r""" +@misc{Chibb_2022, + abstract = {{This paper explores the robustness of multilingual language models against false friends. False friends are words that sound or are written the same in two different languages but have different meaning. Generally, it is argued that multilingual models, such as XLM-RoBERTA, can outperform monolingual models in most tasks on conventional datasets. However, false friends are not considered in these tests. 
In this paper, experiments with a false friends dataset show that multilingual models are not robust against false friends; they have problems creating monolingual representations and differentiating between meanings of similarly written words in different languages. An attempt of word-based finetuning multilingual models on false friends pairs is promising, however the results do not generally solve the presented problem and still, monolingual models are more robust against false friends.}}, + author = {Chibb, Aaron}, + month = {Sep}, + title = {{German-English False Friends in Multilingual Transformer Models: An Evaluation on Robustness and Word-to-Word Fine-Tuning}}, + year = {2022}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/PairClassification/eng/LegalBenchPC.py b/mteb/tasks/PairClassification/eng/LegalBenchPC.py index a8d02f469b..ff767bc178 100644 --- a/mteb/tasks/PairClassification/eng/LegalBenchPC.py +++ b/mteb/tasks/PairClassification/eng/LegalBenchPC.py @@ -84,39 +84,42 @@ class LegalBenchPC(AbsTaskPairClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - } - @article{kolt2022predicting, - title={Predicting consumer contracts}, - author={Kolt, Noam}, - journal={Berkeley Tech. LJ}, - volume={37}, - pages={71}, - year={2022}, - publisher={HeinOnline} - } - @article{zimmeck2019maps, - title={Maps: Scaling privacy compliance analysis to a million apps}, - author={Zimmeck, Sebastian and Story, Peter and Smullen, Daniel and Ravichander, Abhilasha and Wang, Ziqi and Reidenberg, Joel R and Russell, N Cameron and Sadeh, Norman}, - journal={Proc. Priv. Enhancing Tech.}, - volume={2019}, - pages={66}, - year={2019} - } - @article{ravichander2019question, - title={Question answering for privacy policies: Combining computational and legal perspectives}, - author={Ravichander, Abhilasha and Black, Alan W and Wilson, Shomir and Norton, Thomas and Sadeh, Norman}, - journal={arXiv preprint arXiv:1911.00841}, - year={2019} - } - """, + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, +} + +@article{kolt2022predicting, + author = {Kolt, Noam}, + journal = {Berkeley Tech. LJ}, + pages = {71}, + publisher = {HeinOnline}, + title = {Predicting consumer contracts}, + volume = {37}, + year = {2022}, +} + +@article{ravichander2019question, + author = {Ravichander, Abhilasha and Black, Alan W and Wilson, Shomir and Norton, Thomas and Sadeh, Norman}, + journal = {arXiv preprint arXiv:1911.00841}, + title = {Question answering for privacy policies: Combining computational and legal perspectives}, + year = {2019}, +} + +@article{zimmeck2019maps, + author = {Zimmeck, Sebastian and Story, Peter and Smullen, Daniel and Ravichander, Abhilasha and Wang, Ziqi and Reidenberg, Joel R and Russell, N Cameron and Sadeh, Norman}, + journal = {Proc. Priv. 
Enhancing Tech.}, + pages = {66}, + title = {Maps: Scaling privacy compliance analysis to a million apps}, + volume = {2019}, + year = {2019}, +} +""", ) def load_data(self, **kwargs: Any) -> None: diff --git a/mteb/tasks/PairClassification/eng/PubChemAISentenceParaphrasePC.py b/mteb/tasks/PairClassification/eng/PubChemAISentenceParaphrasePC.py index f453ebee31..7dffc2f1fb 100644 --- a/mteb/tasks/PairClassification/eng/PubChemAISentenceParaphrasePC.py +++ b/mteb/tasks/PairClassification/eng/PubChemAISentenceParaphrasePC.py @@ -26,24 +26,25 @@ class PubChemAISentenceParaphrasePC(AbsTaskPairClassification): annotations_creators="LM-generated", dialect=[], sample_creation="created", - bibtex_citation=""" - @article{kasmaee2024chemteb, - title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, - author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, - journal={arXiv preprint arXiv:2412.00532}, - year={2024} - } - @article{kim2023pubchem, - title={PubChem 2023 update}, - author={Kim, Sunghwan and Chen, Jie and Cheng, Tiejun and Gindulyte, Asta and He, Jia and He, Siqian and Li, Qingliang and Shoemaker, Benjamin A and Thiessen, Paul A and Yu, Bo and others}, - journal={Nucleic acids research}, - volume={51}, - number={D1}, - pages={D1373--D1380}, - year={2023}, - publisher={Oxford University Press} - } - """, + bibtex_citation=r""" +@article{kasmaee2024chemteb, + author = {Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal = {arXiv preprint arXiv:2412.00532}, + title = {ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + year = {2024}, +} + +@article{kim2023pubchem, + author = {Kim, Sunghwan and Chen, Jie and 
Cheng, Tiejun and Gindulyte, Asta and He, Jia and He, Siqian and Li, Qingliang and Shoemaker, Benjamin A and Thiessen, Paul A and Yu, Bo and others}, + journal = {Nucleic acids research}, + number = {D1}, + pages = {D1373--D1380}, + publisher = {Oxford University Press}, + title = {PubChem 2023 update}, + volume = {51}, + year = {2023}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/PairClassification/eng/PubChemSMILESPC.py b/mteb/tasks/PairClassification/eng/PubChemSMILESPC.py index b3e297e043..a6772b783a 100644 --- a/mteb/tasks/PairClassification/eng/PubChemSMILESPC.py +++ b/mteb/tasks/PairClassification/eng/PubChemSMILESPC.py @@ -55,24 +55,25 @@ class PubChemSMILESPC(AbsTaskPairClassification): annotations_creators="derived", dialect=[], sample_creation="created", - bibtex_citation=""" - @article{kasmaee2024chemteb, - title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, - author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, - journal={arXiv preprint arXiv:2412.00532}, - year={2024} - } - @article{kim2023pubchem, - title={PubChem 2023 update}, - author={Kim, Sunghwan and Chen, Jie and Cheng, Tiejun and Gindulyte, Asta and He, Jia and He, Siqian and Li, Qingliang and Shoemaker, Benjamin A and Thiessen, Paul A and Yu, Bo and others}, - journal={Nucleic acids research}, - volume={51}, - number={D1}, - pages={D1373--D1380}, - year={2023}, - publisher={Oxford University Press} - } - """, + bibtex_citation=r""" +@article{kasmaee2024chemteb, + author = {Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal = {arXiv preprint arXiv:2412.00532}, + title = {ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific 
Domain}, + year = {2024}, +} + +@article{kim2023pubchem, + author = {Kim, Sunghwan and Chen, Jie and Cheng, Tiejun and Gindulyte, Asta and He, Jia and He, Siqian and Li, Qingliang and Shoemaker, Benjamin A and Thiessen, Paul A and Yu, Bo and others}, + journal = {Nucleic acids research}, + number = {D1}, + pages = {D1373--D1380}, + publisher = {Oxford University Press}, + title = {PubChem 2023 update}, + volume = {51}, + year = {2023}, +} +""", ) def load_data(self): diff --git a/mteb/tasks/PairClassification/eng/PubChemSynonymPC.py b/mteb/tasks/PairClassification/eng/PubChemSynonymPC.py index 6b6dfd81c8..0102e73327 100644 --- a/mteb/tasks/PairClassification/eng/PubChemSynonymPC.py +++ b/mteb/tasks/PairClassification/eng/PubChemSynonymPC.py @@ -26,24 +26,25 @@ class PubChemSynonymPC(AbsTaskPairClassification): annotations_creators="derived", dialect=[], sample_creation="created", - bibtex_citation=""" - @article{kasmaee2024chemteb, - title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, - author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, - journal={arXiv preprint arXiv:2412.00532}, - year={2024} - } - @article{kim2023pubchem, - title={PubChem 2023 update}, - author={Kim, Sunghwan and Chen, Jie and Cheng, Tiejun and Gindulyte, Asta and He, Jia and He, Siqian and Li, Qingliang and Shoemaker, Benjamin A and Thiessen, Paul A and Yu, Bo and others}, - journal={Nucleic acids research}, - volume={51}, - number={D1}, - pages={D1373--D1380}, - year={2023}, - publisher={Oxford University Press} - } - """, + bibtex_citation=r""" +@article{kasmaee2024chemteb, + author = {Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal = {arXiv preprint arXiv:2412.00532}, + title = {ChemTEB: Chemical 
Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + year = {2024}, +} + +@article{kim2023pubchem, + author = {Kim, Sunghwan and Chen, Jie and Cheng, Tiejun and Gindulyte, Asta and He, Jia and He, Siqian and Li, Qingliang and Shoemaker, Benjamin A and Thiessen, Paul A and Yu, Bo and others}, + journal = {Nucleic acids research}, + number = {D1}, + pages = {D1373--D1380}, + publisher = {Oxford University Press}, + title = {PubChem 2023 update}, + volume = {51}, + year = {2023}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/PairClassification/eng/PubChemWikiParagraphsPC.py b/mteb/tasks/PairClassification/eng/PubChemWikiParagraphsPC.py index 679580f28c..b0fe9962b5 100644 --- a/mteb/tasks/PairClassification/eng/PubChemWikiParagraphsPC.py +++ b/mteb/tasks/PairClassification/eng/PubChemWikiParagraphsPC.py @@ -26,24 +26,25 @@ class PubChemWikiParagraphsPC(AbsTaskPairClassification): annotations_creators="derived", dialect=[], sample_creation="created", - bibtex_citation=""" - @article{kasmaee2024chemteb, - title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, - author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, - journal={arXiv preprint arXiv:2412.00532}, - year={2024} - } - @article{kim2023pubchem, - title={PubChem 2023 update}, - author={Kim, Sunghwan and Chen, Jie and Cheng, Tiejun and Gindulyte, Asta and He, Jia and He, Siqian and Li, Qingliang and Shoemaker, Benjamin A and Thiessen, Paul A and Yu, Bo and others}, - journal={Nucleic acids research}, - volume={51}, - number={D1}, - pages={D1373--D1380}, - year={2023}, - publisher={Oxford University Press} - } - """, + bibtex_citation=r""" +@article{kasmaee2024chemteb, + author = {Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, 
Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal = {arXiv preprint arXiv:2412.00532}, + title = {ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + year = {2024}, +} + +@article{kim2023pubchem, + author = {Kim, Sunghwan and Chen, Jie and Cheng, Tiejun and Gindulyte, Asta and He, Jia and He, Siqian and Li, Qingliang and Shoemaker, Benjamin A and Thiessen, Paul A and Yu, Bo and others}, + journal = {Nucleic acids research}, + number = {D1}, + pages = {D1373--D1380}, + publisher = {Oxford University Press}, + title = {PubChem 2023 update}, + volume = {51}, + year = {2023}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/PairClassification/eng/SprintDuplicateQuestionsPC.py b/mteb/tasks/PairClassification/eng/SprintDuplicateQuestionsPC.py index 4c1ea598e2..eaf3f4cf36 100644 --- a/mteb/tasks/PairClassification/eng/SprintDuplicateQuestionsPC.py +++ b/mteb/tasks/PairClassification/eng/SprintDuplicateQuestionsPC.py @@ -31,27 +31,29 @@ class SprintDuplicateQuestionsPC(AbsTaskPairClassification): dialect=[], sample_creation="found", prompt="Retrieve duplicate questions from Sprint forum", - bibtex_citation="""@inproceedings{shah-etal-2018-adversarial, - title = "Adversarial Domain Adaptation for Duplicate Question Detection", - author = "Shah, Darsh and - Lei, Tao and - Moschitti, Alessandro and - Romeo, Salvatore and - Nakov, Preslav", - editor = "Riloff, Ellen and - Chiang, David and - Hockenmaier, Julia and - Tsujii, Jun{'}ichi", - booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing", - month = oct # "-" # nov, - year = "2018", - address = "Brussels, Belgium", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/D18-1131", - doi = "10.18653/v1/D18-1131", - pages = "1056--1063", - abstract = "We address the problem of detecting duplicate questions in forums, which is 
an important step towards automating the process of answering new questions. As finding and annotating such potential duplicates manually is very tedious and costly, automatic methods based on machine learning are a viable alternative. However, many forums do not have annotated data, i.e., questions labeled by experts as duplicates, and thus a promising solution is to use domain adaptation from another forum that has such annotations. Here we focus on adversarial domain adaptation, deriving important findings about when it performs well and what properties of the domains are important in this regard. Our experiments with StackExchange data show an average improvement of 5.6{\%} over the best baseline across multiple pairs of domains.", -}""", + bibtex_citation=r""" +@inproceedings{shah-etal-2018-adversarial, + abstract = {We address the problem of detecting duplicate questions in forums, which is an important step towards automating the process of answering new questions. As finding and annotating such potential duplicates manually is very tedious and costly, automatic methods based on machine learning are a viable alternative. However, many forums do not have annotated data, i.e., questions labeled by experts as duplicates, and thus a promising solution is to use domain adaptation from another forum that has such annotations. Here we focus on adversarial domain adaptation, deriving important findings about when it performs well and what properties of the domains are important in this regard. 
Our experiments with StackExchange data show an average improvement of 5.6{\%} over the best baseline across multiple pairs of domains.}, + address = {Brussels, Belgium}, + author = {Shah, Darsh and +Lei, Tao and +Moschitti, Alessandro and +Romeo, Salvatore and +Nakov, Preslav}, + booktitle = {Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing}, + doi = {10.18653/v1/D18-1131}, + editor = {Riloff, Ellen and +Chiang, David and +Hockenmaier, Julia and +Tsujii, Jun{'}ichi}, + month = oct # {-} # nov, + pages = {1056--1063}, + publisher = {Association for Computational Linguistics}, + title = {Adversarial Domain Adaptation for Duplicate Question Detection}, + url = {https://aclanthology.org/D18-1131}, + year = {2018}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/PairClassification/eng/TwitterSemEval2015PC.py b/mteb/tasks/PairClassification/eng/TwitterSemEval2015PC.py index 9da7c1072e..6914e6744e 100644 --- a/mteb/tasks/PairClassification/eng/TwitterSemEval2015PC.py +++ b/mteb/tasks/PairClassification/eng/TwitterSemEval2015PC.py @@ -27,24 +27,26 @@ class TwitterSemEval2015PC(AbsTaskPairClassification): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{xu-etal-2015-semeval, - title = "{S}em{E}val-2015 Task 1: Paraphrase and Semantic Similarity in {T}witter ({PIT})", - author = "Xu, Wei and - Callison-Burch, Chris and - Dolan, Bill", - editor = "Nakov, Preslav and - Zesch, Torsten and - Cer, Daniel and - Jurgens, David", - booktitle = "Proceedings of the 9th International Workshop on Semantic Evaluation ({S}em{E}val 2015)", - month = jun, - year = "2015", - address = "Denver, Colorado", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/S15-2001", - doi = "10.18653/v1/S15-2001", - pages = "1--11", - }""", + bibtex_citation=r""" +@inproceedings{xu-etal-2015-semeval, + address = {Denver, Colorado}, + author = {Xu, Wei 
and +Callison-Burch, Chris and +Dolan, Bill}, + booktitle = {Proceedings of the 9th International Workshop on Semantic Evaluation ({S}em{E}val 2015)}, + doi = {10.18653/v1/S15-2001}, + editor = {Nakov, Preslav and +Zesch, Torsten and +Cer, Daniel and +Jurgens, David}, + month = jun, + pages = {1--11}, + publisher = {Association for Computational Linguistics}, + title = {{S}em{E}val-2015 Task 1: Paraphrase and Semantic Similarity in {T}witter ({PIT})}, + url = {https://aclanthology.org/S15-2001}, + year = {2015}, +} +""", prompt="Retrieve tweets that are semantically similar to the given tweet", ) diff --git a/mteb/tasks/PairClassification/eng/TwitterURLCorpusPC.py b/mteb/tasks/PairClassification/eng/TwitterURLCorpusPC.py index 85432b1d97..d31ff81410 100644 --- a/mteb/tasks/PairClassification/eng/TwitterURLCorpusPC.py +++ b/mteb/tasks/PairClassification/eng/TwitterURLCorpusPC.py @@ -27,25 +27,27 @@ class TwitterURLCorpusPC(AbsTaskPairClassification): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{lan-etal-2017-continuously, - title = "A Continuously Growing Dataset of Sentential Paraphrases", - author = "Lan, Wuwei and - Qiu, Siyu and - He, Hua and - Xu, Wei", - editor = "Palmer, Martha and - Hwa, Rebecca and - Riedel, Sebastian", - booktitle = "Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing", - month = sep, - year = "2017", - address = "Copenhagen, Denmark", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/D17-1126", - doi = "10.18653/v1/D17-1126", - pages = "1224--1234", - abstract = "A major challenge in paraphrase research is the lack of parallel corpora. In this paper, we present a new method to collect large-scale sentential paraphrases from Twitter by linking tweets through shared URLs. 
The main advantage of our method is its simplicity, as it gets rid of the classifier or human in the loop needed to select data before annotation and subsequent application of paraphrase identification algorithms in the previous work. We present the largest human-labeled paraphrase corpus to date of 51,524 sentence pairs and the first cross-domain benchmarking for automatic paraphrase identification. In addition, we show that more than 30,000 new sentential paraphrases can be easily and continuously captured every month at {\textasciitilde}70{\%} precision, and demonstrate their utility for downstream NLP tasks through phrasal paraphrase extraction. We make our code and data freely available.", - }""", + bibtex_citation=r""" +@inproceedings{lan-etal-2017-continuously, + abstract = {A major challenge in paraphrase research is the lack of parallel corpora. In this paper, we present a new method to collect large-scale sentential paraphrases from Twitter by linking tweets through shared URLs. The main advantage of our method is its simplicity, as it gets rid of the classifier or human in the loop needed to select data before annotation and subsequent application of paraphrase identification algorithms in the previous work. We present the largest human-labeled paraphrase corpus to date of 51,524 sentence pairs and the first cross-domain benchmarking for automatic paraphrase identification. In addition, we show that more than 30,000 new sentential paraphrases can be easily and continuously captured every month at {\textasciitilde}70{\%} precision, and demonstrate their utility for downstream NLP tasks through phrasal paraphrase extraction. 
We make our code and data freely available.}, + address = {Copenhagen, Denmark}, + author = {Lan, Wuwei and +Qiu, Siyu and +He, Hua and +Xu, Wei}, + booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing}, + doi = {10.18653/v1/D17-1126}, + editor = {Palmer, Martha and +Hwa, Rebecca and +Riedel, Sebastian}, + month = sep, + pages = {1224--1234}, + publisher = {Association for Computational Linguistics}, + title = {A Continuously Growing Dataset of Sentential Paraphrases}, + url = {https://aclanthology.org/D17-1126}, + year = {2017}, +} +""", prompt="Retrieve tweets that are semantically similar to the given tweet", ) diff --git a/mteb/tasks/PairClassification/fas/FaMTEBPairClassification.py b/mteb/tasks/PairClassification/fas/FaMTEBPairClassification.py index 6deba76d8d..01cccca055 100644 --- a/mteb/tasks/PairClassification/fas/FaMTEBPairClassification.py +++ b/mteb/tasks/PairClassification/fas/FaMTEBPairClassification.py @@ -26,7 +26,19 @@ class CExaPPC(AbsTaskPairClassification): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation=""" """, + bibtex_citation=r""" +@inproceedings{9786243, + author = {Sadeghi, Reyhaneh and Karbasi, Hamed and Akbari, Ahmad}, + booktitle = {2022 8th International Conference on Web Research (ICWR)}, + doi = {10.1109/ICWR54782.2022.9786243}, + keywords = {Data mining;Task analysis;Paraphrase Identification;Semantic Similarity;Deep Learning;Paraphrasing Corpora}, + number = {}, + pages = {168-175}, + title = {ExaPPC: a Large-Scale Persian Paraphrase Detection Corpus}, + volume = {}, + year = {2022}, +} +""", ) def dataset_transform(self): @@ -214,13 +226,23 @@ class ParsinluEntail(AbsTaskPairClassification): eval_langs=["fas-Arab"], main_score="max_ap", date=("2024-09-01", "2024-12-31"), - domains=[], + domains=["Reviews", "Written"], task_subtypes=[], license="not specified", annotations_creators="derived", dialect=[], sample_creation="found", - 
bibtex_citation=""" """, + bibtex_citation=r""" +@misc{khashabi2021parsinlusuitelanguageunderstanding, + archiveprefix = {arXiv}, + author = {Daniel Khashabi and Arman Cohan and Siamak Shakeri and Pedram Hosseini and Pouya Pezeshkpour and Malihe Alikhani and Moin Aminnaseri and Marzieh Bitaab and Faeze Brahman and Sarik Ghazarian and Mozhdeh Gheini and Arman Kabiri and Rabeeh Karimi Mahabadi and Omid Memarrast and Ahmadreza Mosallanezhad and Erfan Noury and Shahab Raji and Mohammad Sadegh Rasooli and Sepideh Sadeghi and Erfan Sadeqi Azer and Niloofar Safi Samghabadi and Mahsa Shafaei and Saber Sheybani and Ali Tazarv and Yadollah Yaghoobzadeh}, + eprint = {2012.06154}, + primaryclass = {cs.CL}, + title = {ParsiNLU: A Suite of Language Understanding Challenges for Persian}, + url = {https://arxiv.org/abs/2012.06154}, + year = {2021}, +} +""", ) def dataset_transform(self): @@ -257,13 +279,23 @@ class ParsinluQueryParaphPC(AbsTaskPairClassification): eval_langs=["fas-Arab"], main_score="max_ap", date=("2024-09-01", "2024-12-31"), - domains=[], + domains=["Reviews", "Written"], task_subtypes=[], license="not specified", annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation=""" """, + bibtex_citation=r""" +@misc{khashabi2021parsinlusuitelanguageunderstanding, + archiveprefix = {arXiv}, + author = {Daniel Khashabi and Arman Cohan and Siamak Shakeri and Pedram Hosseini and Pouya Pezeshkpour and Malihe Alikhani and Moin Aminnaseri and Marzieh Bitaab and Faeze Brahman and Sarik Ghazarian and Mozhdeh Gheini and Arman Kabiri and Rabeeh Karimi Mahabadi and Omid Memarrast and Ahmadreza Mosallanezhad and Erfan Noury and Shahab Raji and Mohammad Sadegh Rasooli and Sepideh Sadeghi and Erfan Sadeqi Azer and Niloofar Safi Samghabadi and Mahsa Shafaei and Saber Sheybani and Ali Tazarv and Yadollah Yaghoobzadeh}, + eprint = {2012.06154}, + primaryclass = {cs.CL}, + title = {ParsiNLU: A Suite of Language Understanding Challenges for Persian}, + url = 
{https://arxiv.org/abs/2012.06154}, + year = {2021}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/PairClassification/fas/FarsTail.py b/mteb/tasks/PairClassification/fas/FarsTail.py index 552e953f77..220a9756f8 100644 --- a/mteb/tasks/PairClassification/fas/FarsTail.py +++ b/mteb/tasks/PairClassification/fas/FarsTail.py @@ -28,14 +28,16 @@ class FarsTail(AbsTaskPairClassification): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@article{amirkhani2023farstail, - title={FarsTail: a Persian natural language inference dataset}, - author={Amirkhani, Hossein and AzariJafari, Mohammad and Faridan-Jahromi, Soroush and Kouhkan, Zeinab and Pourjafari, Zohreh and Amirak, Azadeh}, - journal={Soft Computing}, - year={2023}, - publisher={Springer}, - doi={10.1007/s00500-023-08959-3} - }""", # after removing neutral + bibtex_citation=r""" +@article{amirkhani2023farstail, + author = {Amirkhani, Hossein and AzariJafari, Mohammad and Faridan-Jahromi, Soroush and Kouhkan, Zeinab and Pourjafari, Zohreh and Amirak, Azadeh}, + doi = {10.1007/s00500-023-08959-3}, + journal = {Soft Computing}, + publisher = {Springer}, + title = {FarsTail: a Persian natural language inference dataset}, + year = {2023}, +} +""", # after removing neutral ) def load_data(self, **kwargs): diff --git a/mteb/tasks/PairClassification/hye/ArmenianParaphrasePC.py b/mteb/tasks/PairClassification/hye/ArmenianParaphrasePC.py index 04431c2238..53a6f537d3 100644 --- a/mteb/tasks/PairClassification/hye/ArmenianParaphrasePC.py +++ b/mteb/tasks/PairClassification/hye/ArmenianParaphrasePC.py @@ -26,16 +26,16 @@ class ArmenianParaphrasePC(AbsTaskPairClassification): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{malajyan2020arpa, - title={ARPA: Armenian Paraphrase Detection Corpus and Models}, - author={Arthur Malajyan and Karen Avetisyan and Tsolak Ghukasyan}, - year={2020}, - eprint={2009.12615}, - 
archivePrefix={arXiv}, - primaryClass={cs.CL} - } - """, + bibtex_citation=r""" +@misc{malajyan2020arpa, + archiveprefix = {arXiv}, + author = {Arthur Malajyan and Karen Avetisyan and Tsolak Ghukasyan}, + eprint = {2009.12615}, + primaryclass = {cs.CL}, + title = {ARPA: Armenian Paraphrase Detection Corpus and Models}, + year = {2020}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/PairClassification/ind/IndoNLI.py b/mteb/tasks/PairClassification/ind/IndoNLI.py index ac0976e475..f389c7fa0b 100644 --- a/mteb/tasks/PairClassification/ind/IndoNLI.py +++ b/mteb/tasks/PairClassification/ind/IndoNLI.py @@ -27,17 +27,19 @@ class IndoNLI(AbsTaskPairClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{mahendra-etal-2021-indonli, - title = "{I}ndo{NLI}: A Natural Language Inference Dataset for {I}ndonesian", - author = "Mahendra, Rahmad and Aji, Alham Fikri and Louvan, Samuel and Rahman, Fahrurrozi and Vania, Clara", - booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing", - month = nov, - year = "2021", - address = "Online and Punta Cana, Dominican Republic", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/2021.emnlp-main.821", - pages = "10511--10527", - }""", + bibtex_citation=r""" +@inproceedings{mahendra-etal-2021-indonli, + address = {Online and Punta Cana, Dominican Republic}, + author = {Mahendra, Rahmad and Aji, Alham Fikri and Louvan, Samuel and Rahman, Fahrurrozi and Vania, Clara}, + booktitle = {Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing}, + month = nov, + pages = {10511--10527}, + publisher = {Association for Computational Linguistics}, + title = {{I}ndo{NLI}: A Natural Language Inference Dataset for {I}ndonesian}, + url = {https://aclanthology.org/2021.emnlp-main.821}, + year = {2021}, +} +""", # after removing neutral ) diff --git 
a/mteb/tasks/PairClassification/kor/KlueNLI.py b/mteb/tasks/PairClassification/kor/KlueNLI.py index 9bd2a0d2c6..f5092133a4 100644 --- a/mteb/tasks/PairClassification/kor/KlueNLI.py +++ b/mteb/tasks/PairClassification/kor/KlueNLI.py @@ -27,14 +27,16 @@ class KlueNLI(AbsTaskPairClassification): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@misc{park2021klue, - title={KLUE: Korean Language Understanding Evaluation}, - author={Sungjoon Park and Jihyung Moon and Sungdong Kim and Won Ik Cho and Jiyoon Han and Jangwon Park and Chisung Song and Junseong Kim and Yongsook Song and Taehwan Oh and Joohong Lee and Juhyun Oh and Sungwon Lyu and Younghoon Jeong and Inkwon Lee and Sangwoo Seo and Dongjun Lee and Hyunwoo Kim and Myeonghwa Lee and Seongbo Jang and Seungwon Do and Sunkyoung Kim and Kyungtae Lim and Jongwon Lee and Kyumin Park and Jamin Shin and Seonghyun Kim and Lucy Park and Alice Oh and Jungwoo Ha and Kyunghyun Cho}, - year={2021}, - eprint={2105.09680}, - archivePrefix={arXiv}, - primaryClass={cs.CL} -}""", # 3000 - neutral samples + bibtex_citation=r""" +@misc{park2021klue, + archiveprefix = {arXiv}, + author = {Sungjoon Park and Jihyung Moon and Sungdong Kim and Won Ik Cho and Jiyoon Han and Jangwon Park and Chisung Song and Junseong Kim and Yongsook Song and Taehwan Oh and Joohong Lee and Juhyun Oh and Sungwon Lyu and Younghoon Jeong and Inkwon Lee and Sangwoo Seo and Dongjun Lee and Hyunwoo Kim and Myeonghwa Lee and Seongbo Jang and Seungwon Do and Sunkyoung Kim and Kyungtae Lim and Jongwon Lee and Kyumin Park and Jamin Shin and Seonghyun Kim and Lucy Park and Alice Oh and Jungwoo Ha and Kyunghyun Cho}, + eprint = {2105.09680}, + primaryclass = {cs.CL}, + title = {KLUE: Korean Language Understanding Evaluation}, + year = {2021}, +} +""", # 3000 - neutral samples ) def dataset_transform(self): diff --git a/mteb/tasks/PairClassification/multilingual/IndicXnliPairClassification.py 
b/mteb/tasks/PairClassification/multilingual/IndicXnliPairClassification.py index c26394d92d..0a48487903 100644 --- a/mteb/tasks/PairClassification/multilingual/IndicXnliPairClassification.py +++ b/mteb/tasks/PairClassification/multilingual/IndicXnliPairClassification.py @@ -49,17 +49,17 @@ class IndicXnliPairClassification(AbsTaskPairClassification, MultilingualTask): annotations_creators="derived", dialect=[], sample_creation="machine-translated", - bibtex_citation=""" - @misc{aggarwal_gupta_kunch_22, - doi = {10.48550/ARXIV.2204.08776}, - url = {https://arxiv.org/abs/2204.08776}, - author = {Aggarwal, Divyanshu and Gupta, Vivek and Kunchukuttan, Anoop}, - title = {IndicXNLI: Evaluating Multilingual Inference for Indian Languages}, - publisher = {arXiv}, - year = {2022}, - copyright = {Creative Commons Attribution 4.0 International} - } - """, + bibtex_citation=r""" +@misc{aggarwal_gupta_kunch_22, + author = {Aggarwal, Divyanshu and Gupta, Vivek and Kunchukuttan, Anoop}, + copyright = {Creative Commons Attribution 4.0 International}, + doi = {10.48550/ARXIV.2204.08776}, + publisher = {arXiv}, + title = {IndicXNLI: Evaluating Multilingual Inference for Indian Languages}, + url = {https://arxiv.org/abs/2204.08776}, + year = {2022}, +} +""", # average of premise and hypothesis ) diff --git a/mteb/tasks/PairClassification/multilingual/OpusparcusPC.py b/mteb/tasks/PairClassification/multilingual/OpusparcusPC.py index bc23e7956d..da1e9c32dd 100644 --- a/mteb/tasks/PairClassification/multilingual/OpusparcusPC.py +++ b/mteb/tasks/PairClassification/multilingual/OpusparcusPC.py @@ -39,14 +39,16 @@ class OpusparcusPC(AbsTaskPairClassification, MultilingualTask): annotations_creators="human-annotated", dialect=[], sample_creation="created", - bibtex_citation="""@misc{creutz2018open, - title={Open Subtitles Paraphrase Corpus for Six Languages}, - author={Mathias Creutz}, - year={2018}, - eprint={1809.06142}, - archivePrefix={arXiv}, - primaryClass={cs.CL} -}""", + 
bibtex_citation=r""" +@misc{creutz2018open, + archiveprefix = {arXiv}, + author = {Mathias Creutz}, + eprint = {1809.06142}, + primaryclass = {cs.CL}, + title = {Open Subtitles Paraphrase Corpus for Six Languages}, + year = {2018}, +} +""", ) def load_data(self, **kwargs): diff --git a/mteb/tasks/PairClassification/multilingual/PawsXPairClassification.py b/mteb/tasks/PairClassification/multilingual/PawsXPairClassification.py index 66bc37de95..dee2952dd8 100644 --- a/mteb/tasks/PairClassification/multilingual/PawsXPairClassification.py +++ b/mteb/tasks/PairClassification/multilingual/PawsXPairClassification.py @@ -37,14 +37,16 @@ class PawsXPairClassification(MultilingualTask, AbsTaskPairClassification): annotations_creators="human-annotated", dialect=[], sample_creation="human-translated", - bibtex_citation="""@misc{yang2019pawsx, - title={PAWS-X: A Cross-lingual Adversarial Dataset for Paraphrase Identification}, - author={Yinfei Yang and Yuan Zhang and Chris Tar and Jason Baldridge}, - year={2019}, - eprint={1908.11828}, - archivePrefix={arXiv}, - primaryClass={cs.CL} -}""", + bibtex_citation=r""" +@misc{yang2019pawsx, + archiveprefix = {arXiv}, + author = {Yinfei Yang and Yuan Zhang and Chris Tar and Jason Baldridge}, + eprint = {1908.11828}, + primaryclass = {cs.CL}, + title = {PAWS-X: A Cross-lingual Adversarial Dataset for Paraphrase Identification}, + year = {2019}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/PairClassification/multilingual/PubChemWikiPairClassification.py b/mteb/tasks/PairClassification/multilingual/PubChemWikiPairClassification.py index f1b3102fbc..c3be5a701b 100644 --- a/mteb/tasks/PairClassification/multilingual/PubChemWikiPairClassification.py +++ b/mteb/tasks/PairClassification/multilingual/PubChemWikiPairClassification.py @@ -42,24 +42,25 @@ class PubChemWikiPairClassification(AbsTaskPairClassification, MultilingualTask) annotations_creators="derived", dialect=[], sample_creation="created", - bibtex_citation=""" - 
@article{kasmaee2024chemteb, - title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \\& Efficiency on a Specific Domain}, - author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, - journal={arXiv preprint arXiv:2412.00532}, - year={2024} - } - @article{kim2023pubchem, - title={PubChem 2023 update}, - author={Kim, Sunghwan and Chen, Jie and Cheng, Tiejun and Gindulyte, Asta and He, Jia and He, Siqian and Li, Qingliang and Shoemaker, Benjamin A and Thiessen, Paul A and Yu, Bo and others}, - journal={Nucleic acids research}, - volume={51}, - number={D1}, - pages={D1373--D1380}, - year={2023}, - publisher={Oxford University Press} - } - """, + bibtex_citation=r""" +@article{kasmaee2024chemteb, + author = {Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal = {arXiv preprint arXiv:2412.00532}, + title = {ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + year = {2024}, +} + +@article{kim2023pubchem, + author = {Kim, Sunghwan and Chen, Jie and Cheng, Tiejun and Gindulyte, Asta and He, Jia and He, Siqian and Li, Qingliang and Shoemaker, Benjamin A and Thiessen, Paul A and Yu, Bo and others}, + journal = {Nucleic acids research}, + number = {D1}, + pages = {D1373--D1380}, + publisher = {Oxford University Press}, + title = {PubChem 2023 update}, + volume = {51}, + year = {2023}, +} +""", ) def dataset_transform(self) -> None: diff --git a/mteb/tasks/PairClassification/multilingual/RTE3.py b/mteb/tasks/PairClassification/multilingual/RTE3.py index 9a03fedb4f..49c2e0cdd3 100644 --- a/mteb/tasks/PairClassification/multilingual/RTE3.py +++ b/mteb/tasks/PairClassification/multilingual/RTE3.py @@ -36,21 +36,22 @@ class RTE3(MultilingualTask, 
AbsTaskPairClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{giampiccolo-etal-2007-third, - title = "The Third {PASCAL} Recognizing Textual Entailment Challenge", - author = "Giampiccolo, Danilo and - Magnini, Bernardo and - Dagan, Ido and - Dolan, Bill", - booktitle = "Proceedings of the {ACL}-{PASCAL} Workshop on Textual Entailment and Paraphrasing", - month = jun, - year = "2007", - address = "Prague", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/W07-1401", - pages = "1--9", - } - """, + bibtex_citation=r""" +@inproceedings{giampiccolo-etal-2007-third, + address = {Prague}, + author = {Giampiccolo, Danilo and +Magnini, Bernardo and +Dagan, Ido and +Dolan, Bill}, + booktitle = {Proceedings of the {ACL}-{PASCAL} Workshop on Textual Entailment and Paraphrasing}, + month = jun, + pages = {1--9}, + publisher = {Association for Computational Linguistics}, + title = {The Third {PASCAL} Recognizing Textual Entailment Challenge}, + url = {https://aclanthology.org/W07-1401}, + year = {2007}, +} +""", # sum of 4 languages after neutral filtering ) diff --git a/mteb/tasks/PairClassification/multilingual/XNLI.py b/mteb/tasks/PairClassification/multilingual/XNLI.py index 8f3f795bad..0c303a1aa1 100644 --- a/mteb/tasks/PairClassification/multilingual/XNLI.py +++ b/mteb/tasks/PairClassification/multilingual/XNLI.py @@ -44,22 +44,23 @@ class XNLI(MultilingualTask, AbsTaskPairClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="created", - bibtex_citation="""@InProceedings{conneau2018xnli, - author = {Conneau, Alexis - and Rinott, Ruty - and Lample, Guillaume - and Williams, Adina - and Bowman, Samuel R. 
- and Schwenk, Holger - and Stoyanov, Veselin}, - title = {XNLI: Evaluating Cross-lingual Sentence Representations}, - booktitle = {Proceedings of the 2018 Conference on Empirical Methods - in Natural Language Processing}, - year = {2018}, - publisher = {Association for Computational Linguistics}, - location = {Brussels, Belgium}, - } - """, + bibtex_citation=r""" +@inproceedings{conneau2018xnli, + author = {Conneau, Alexis +and Rinott, Ruty +and Lample, Guillaume +and Williams, Adina +and Bowman, Samuel R. +and Schwenk, Holger +and Stoyanov, Veselin}, + booktitle = {Proceedings of the 2018 Conference on Empirical Methods +in Natural Language Processing}, + location = {Brussels, Belgium}, + publisher = {Association for Computational Linguistics}, + title = {XNLI: Evaluating Cross-lingual Sentence Representations}, + year = {2018}, +} +""", ) def dataset_transform(self): @@ -131,15 +132,16 @@ class XNLIV2(MultilingualTask, AbsTaskPairClassification): annotations_creators="expert-annotated", dialect=[], sample_creation="machine-translated and verified", - bibtex_citation="""@inproceedings{upadhyay2023xnli, - title={XNLI 2.0: Improving XNLI dataset and performance on Cross Lingual Understanding (XLU)}, - author={Upadhyay, Ankit Kumar and Upadhya, Harsit Kumar}, - booktitle={2023 IEEE 8th International Conference for Convergence in Technology (I2CT)}, - pages={1--6}, - year={2023}, - organization={IEEE} - } - """, + bibtex_citation=r""" +@inproceedings{upadhyay2023xnli, + author = {Upadhyay, Ankit Kumar and Upadhya, Harsit Kumar}, + booktitle = {2023 IEEE 8th International Conference for Convergence in Technology (I2CT)}, + organization = {IEEE}, + pages = {1--6}, + title = {XNLI 2.0: Improving XNLI dataset and performance on Cross Lingual Understanding (XLU)}, + year = {2023}, +} +""", # average of premise and hypothesis ) diff --git a/mteb/tasks/PairClassification/multilingual/XStance.py b/mteb/tasks/PairClassification/multilingual/XStance.py index 
03d4f066e7..9852374e06 100644 --- a/mteb/tasks/PairClassification/multilingual/XStance.py +++ b/mteb/tasks/PairClassification/multilingual/XStance.py @@ -35,17 +35,17 @@ class XStance(MultilingualTask, AbsTaskPairClassification): annotations_creators="human-annotated", dialect=[], sample_creation="created", - bibtex_citation=""" - @inproceedings{vamvas2020xstance, - author = "Vamvas, Jannis and Sennrich, Rico", - title = "{X-Stance}: A Multilingual Multi-Target Dataset for Stance Detection", - booktitle = "Proceedings of the 5th Swiss Text Analytics Conference (SwissText) 16th Conference on Natural Language Processing (KONVENS)", - address = "Zurich, Switzerland", - year = "2020", - month = "jun", - url = "http://ceur-ws.org/Vol-2624/paper9.pdf" - } - """, + bibtex_citation=r""" +@inproceedings{vamvas2020xstance, + address = {Zurich, Switzerland}, + author = {Vamvas, Jannis and Sennrich, Rico}, + booktitle = {Proceedings of the 5th Swiss Text Analytics Conference (SwissText) 16th Conference on Natural Language Processing (KONVENS)}, + month = {jun}, + title = {{X-Stance}: A Multilingual Multi-Target Dataset for Stance Detection}, + url = {http://ceur-ws.org/Vol-2624/paper9.pdf}, + year = {2020}, +} +""", # length of`sent1` + `sent2` ) diff --git a/mteb/tasks/PairClassification/pol/PolishPC.py b/mteb/tasks/PairClassification/pol/PolishPC.py index 099a953642..d87811accc 100644 --- a/mteb/tasks/PairClassification/pol/PolishPC.py +++ b/mteb/tasks/PairClassification/pol/PolishPC.py @@ -21,42 +21,44 @@ class SickePLPC(AbsTaskPairClassification): eval_langs=["pol-Latn"], main_score="max_ap", date=None, - domains=None, + domains=["Reviews"], task_subtypes=None, license=None, annotations_creators=None, dialect=None, sample_creation=None, - bibtex_citation="""@inproceedings{dadas-etal-2020-evaluation, - title = "Evaluation of Sentence Representations in {P}olish", - author = "Dadas, Slawomir and - Pere{\l}kiewicz, Micha{\l} and - Po{\'s}wiata, Rafa{\l}", - editor = 
"Calzolari, Nicoletta and - B{\'e}chet, Fr{\'e}d{\'e}ric and - Blache, Philippe and - Choukri, Khalid and - Cieri, Christopher and - Declerck, Thierry and - Goggi, Sara and - Isahara, Hitoshi and - Maegaard, Bente and - Mariani, Joseph and - Mazo, H{\'e}l{\`e}ne and - Moreno, Asuncion and - Odijk, Jan and - Piperidis, Stelios", - booktitle = "Proceedings of the Twelfth Language Resources and Evaluation Conference", - month = may, - year = "2020", - address = "Marseille, France", - publisher = "European Language Resources Association", - url = "https://aclanthology.org/2020.lrec-1.207", - pages = "1674--1680", - abstract = "Methods for learning sentence representations have been actively developed in recent years. However, the lack of pre-trained models and datasets annotated at the sentence level has been a problem for low-resource languages such as Polish which led to less interest in applying these methods to language-specific tasks. In this study, we introduce two new Polish datasets for evaluating sentence embeddings and provide a comprehensive evaluation of eight sentence representation methods including Polish and multilingual models. We consider classic word embedding models, recently developed contextual embeddings and multilingual sentence encoders, showing strengths and weaknesses of specific approaches. We also examine different methods of aggregating word vectors into a single sentence vector.", - language = "English", - ISBN = "979-10-95546-34-4", - }""", + bibtex_citation=r""" +@inproceedings{dadas-etal-2020-evaluation, + abstract = {Methods for learning sentence representations have been actively developed in recent years. However, the lack of pre-trained models and datasets annotated at the sentence level has been a problem for low-resource languages such as Polish which led to less interest in applying these methods to language-specific tasks. 
In this study, we introduce two new Polish datasets for evaluating sentence embeddings and provide a comprehensive evaluation of eight sentence representation methods including Polish and multilingual models. We consider classic word embedding models, recently developed contextual embeddings and multilingual sentence encoders, showing strengths and weaknesses of specific approaches. We also examine different methods of aggregating word vectors into a single sentence vector.}, + address = {Marseille, France}, + author = {Dadas, Slawomir and +Pere{\l}kiewicz, Micha{\l} and +Po{\'s}wiata, Rafa{\l}}, + booktitle = {Proceedings of the Twelfth Language Resources and Evaluation Conference}, + editor = {Calzolari, Nicoletta and +B{\'e}chet, Fr{\'e}d{\'e}ric and +Blache, Philippe and +Choukri, Khalid and +Cieri, Christopher and +Declerck, Thierry and +Goggi, Sara and +Isahara, Hitoshi and +Maegaard, Bente and +Mariani, Joseph and +Mazo, H{\'e}l{\`e}ne and +Moreno, Asuncion and +Odijk, Jan and +Piperidis, Stelios}, + isbn = {979-10-95546-34-4}, + language = {English}, + month = may, + pages = {1674--1680}, + publisher = {European Language Resources Association}, + title = {Evaluation of Sentence Representations in {P}olish}, + url = {https://aclanthology.org/2020.lrec-1.207}, + year = {2020}, +} +""", ) def dataset_transform(self): @@ -95,14 +97,16 @@ class PpcPC(AbsTaskPairClassification): annotations_creators="derived", # mined dialect=[], sample_creation="found", - bibtex_citation="""@misc{dadas2022training, - title={Training Effective Neural Sentence Encoders from Automatically Mined Paraphrases}, - author={Sławomir Dadas}, - year={2022}, - eprint={2207.12759}, - archivePrefix={arXiv}, - primaryClass={cs.CL} -}""", + bibtex_citation=r""" +@misc{dadas2022training, + archiveprefix = {arXiv}, + author = {Sławomir Dadas}, + eprint = {2207.12759}, + primaryclass = {cs.CL}, + title = {Training Effective Neural Sentence Encoders from Automatically Mined Paraphrases}, + year = 
{2022}, +} +""", ) def dataset_transform(self): @@ -132,22 +136,24 @@ class CdscePC(AbsTaskPairClassification): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{wroblewska-krasnowska-kieras-2017-polish, - title = "{P}olish evaluation dataset for compositional distributional semantics models", - author = "Wr{\'o}blewska, Alina and - Krasnowska-Kiera{\'s}, Katarzyna", - editor = "Barzilay, Regina and - Kan, Min-Yen", - booktitle = "Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)", - month = jul, - year = "2017", - address = "Vancouver, Canada", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/P17-1073", - doi = "10.18653/v1/P17-1073", - pages = "784--792", - abstract = "The paper presents a procedure of building an evaluation dataset. for the validation of compositional distributional semantics models estimated for languages other than English. The procedure generally builds on steps designed to assemble the SICK corpus, which contains pairs of English sentences annotated for semantic relatedness and entailment, because we aim at building a comparable dataset. However, the implementation of particular building steps significantly differs from the original SICK design assumptions, which is caused by both lack of necessary extraneous resources for an investigated language and the need for language-specific transformation rules. The designed procedure is verified on Polish, a fusional language with a relatively free word order, and contributes to building a Polish evaluation dataset. The resource consists of 10K sentence pairs which are human-annotated for semantic relatedness and entailment. 
The dataset may be used for the evaluation of compositional distributional semantics models of Polish.", - }""", + bibtex_citation=r""" +@inproceedings{wroblewska-krasnowska-kieras-2017-polish, + abstract = {The paper presents a procedure of building an evaluation dataset. for the validation of compositional distributional semantics models estimated for languages other than English. The procedure generally builds on steps designed to assemble the SICK corpus, which contains pairs of English sentences annotated for semantic relatedness and entailment, because we aim at building a comparable dataset. However, the implementation of particular building steps significantly differs from the original SICK design assumptions, which is caused by both lack of necessary extraneous resources for an investigated language and the need for language-specific transformation rules. The designed procedure is verified on Polish, a fusional language with a relatively free word order, and contributes to building a Polish evaluation dataset. The resource consists of 10K sentence pairs which are human-annotated for semantic relatedness and entailment. 
The dataset may be used for the evaluation of compositional distributional semantics models of Polish.}, + address = {Vancouver, Canada}, + author = {Wr{\'o}blewska, Alina and +Krasnowska-Kiera{\'s}, Katarzyna}, + booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)}, + doi = {10.18653/v1/P17-1073}, + editor = {Barzilay, Regina and +Kan, Min-Yen}, + month = jul, + pages = {784--792}, + publisher = {Association for Computational Linguistics}, + title = {{P}olish evaluation dataset for compositional distributional semantics models}, + url = {https://aclanthology.org/P17-1073}, + year = {2017}, +} +""", ) def dataset_transform(self): @@ -177,28 +183,30 @@ class PscPC(AbsTaskPairClassification): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{ogrodniczuk-kopec-2014-polish, - title = "The {P}olish Summaries Corpus", - author = "Ogrodniczuk, Maciej and - Kope{\'c}, Mateusz", - editor = "Calzolari, Nicoletta and - Choukri, Khalid and - Declerck, Thierry and - Loftsson, Hrafn and - Maegaard, Bente and - Mariani, Joseph and - Moreno, Asuncion and - Odijk, Jan and - Piperidis, Stelios", - booktitle = "Proceedings of the Ninth International Conference on Language Resources and Evaluation ({LREC}'14)", - month = may, - year = "2014", - address = "Reykjavik, Iceland", - publisher = "European Language Resources Association (ELRA)", - url = "http://www.lrec-conf.org/proceedings/lrec2014/pdf/1211_Paper.pdf", - pages = "3712--3715", - abstract = "This article presents the Polish Summaries Corpus, a new resource created to support the development and evaluation of the tools for automated single-document summarization of Polish. The Corpus contains a large number of manual summaries of news articles, with many independently created summaries for a single text. 
Such approach is supposed to overcome the annotator bias, which is often described as a problem during the evaluation of the summarization algorithms against a single gold standard. There are several summarizers developed specifically for Polish language, but their in-depth evaluation and comparison was impossible without a large, manually created corpus. We present in detail the process of text selection, annotation process and the contents of the corpus, which includes both abstract free-word summaries, as well as extraction-based summaries created by selecting text spans from the original document. Finally, we describe how that resource could be used not only for the evaluation of the existing summarization tools, but also for studies on the human summarization process in Polish language.", - }""", + bibtex_citation=r""" +@inproceedings{ogrodniczuk-kopec-2014-polish, + abstract = {This article presents the Polish Summaries Corpus, a new resource created to support the development and evaluation of the tools for automated single-document summarization of Polish. The Corpus contains a large number of manual summaries of news articles, with many independently created summaries for a single text. Such approach is supposed to overcome the annotator bias, which is often described as a problem during the evaluation of the summarization algorithms against a single gold standard. There are several summarizers developed specifically for Polish language, but their in-depth evaluation and comparison was impossible without a large, manually created corpus. We present in detail the process of text selection, annotation process and the contents of the corpus, which includes both abstract free-word summaries, as well as extraction-based summaries created by selecting text spans from the original document. 
Finally, we describe how that resource could be used not only for the evaluation of the existing summarization tools, but also for studies on the human summarization process in Polish language.}, + address = {Reykjavik, Iceland}, + author = {Ogrodniczuk, Maciej and +Kope{\'c}, Mateusz}, + booktitle = {Proceedings of the Ninth International Conference on Language Resources and Evaluation ({LREC}'14)}, + editor = {Calzolari, Nicoletta and +Choukri, Khalid and +Declerck, Thierry and +Loftsson, Hrafn and +Maegaard, Bente and +Mariani, Joseph and +Moreno, Asuncion and +Odijk, Jan and +Piperidis, Stelios}, + month = may, + pages = {3712--3715}, + publisher = {European Language Resources Association (ELRA)}, + title = {The {P}olish Summaries Corpus}, + url = {http://www.lrec-conf.org/proceedings/lrec2014/pdf/1211_Paper.pdf}, + year = {2014}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/PairClassification/por/Assin2RTE.py b/mteb/tasks/PairClassification/por/Assin2RTE.py index aa0046cb6e..70d6a1b929 100644 --- a/mteb/tasks/PairClassification/por/Assin2RTE.py +++ b/mteb/tasks/PairClassification/por/Assin2RTE.py @@ -26,14 +26,16 @@ class Assin2RTE(AbsTaskPairClassification): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{real2020assin, - title={The assin 2 shared task: a quick overview}, - author={Real, Livy and Fonseca, Erick and Oliveira, Hugo Goncalo}, - booktitle={International Conference on Computational Processing of the Portuguese Language}, - pages={406--412}, - year={2020}, - organization={Springer} - }""", + bibtex_citation=r""" +@inproceedings{real2020assin, + author = {Real, Livy and Fonseca, Erick and Oliveira, Hugo Goncalo}, + booktitle = {International Conference on Computational Processing of the Portuguese Language}, + organization = {Springer}, + pages = {406--412}, + title = {The assin 2 shared task: a quick overview}, + year = {2020}, +} +""", ) def 
dataset_transform(self): diff --git a/mteb/tasks/PairClassification/por/SickBrPC.py b/mteb/tasks/PairClassification/por/SickBrPC.py index 6ac2cb9a38..445e6c87b4 100644 --- a/mteb/tasks/PairClassification/por/SickBrPC.py +++ b/mteb/tasks/PairClassification/por/SickBrPC.py @@ -26,27 +26,27 @@ class SickBrPC(AbsTaskPairClassification): annotations_creators="human-annotated", dialect=[], sample_creation="human-translated and localized", - bibtex_citation=""" - @inproceedings{real18, - author="Real, Livy - and Rodrigues, Ana - and Vieira e Silva, Andressa - and Albiero, Beatriz - and Thalenberg, Bruna - and Guide, Bruno - and Silva, Cindy - and de Oliveira Lima, Guilherme - and C{\\^a}mara, Igor C. S. - and Stanojevi{\\'{c}}, Milo{\\v{s}} - and Souza, Rodrigo - and de Paiva, Valeria" - year ="2018", - title="SICK-BR: A Portuguese Corpus for Inference", - booktitle="Computational Processing of the Portuguese Language. PROPOR 2018.", - doi ="10.1007/978-3-319-99722-3_31", - isbn="978-3-319-99722-3" - } - """, + bibtex_citation=r""" +@inproceedings{real18, + author = {Real, Livy +and Rodrigues, Ana +and Vieira e Silva, Andressa +and Albiero, Beatriz +and Thalenberg, Bruna +and Guide, Bruno +and Silva, Cindy +and de Oliveira Lima, Guilherme +and C{\^a}mara, Igor C. S. +and Stanojevi{\'{c}}, Milo{\v{s}} +and Souza, Rodrigo +and de Paiva, Valeria}, + booktitle = {{Computational Processing of the Portuguese Language. 
PROPOR 2018.}}, + doi = {10.1007/978-3-319-99722-3_31}, + isbn = {978-3-319-99722-3}, + title = {{SICK-BR: A Portuguese Corpus for Inference}}, + year = {2018}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/PairClassification/rus/TERRa.py b/mteb/tasks/PairClassification/rus/TERRa.py index 50b9560a46..3fc382b497 100644 --- a/mteb/tasks/PairClassification/rus/TERRa.py +++ b/mteb/tasks/PairClassification/rus/TERRa.py @@ -28,21 +28,23 @@ class TERRa(AbsTaskPairClassification): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@article{shavrina2020russiansuperglue, - title={RussianSuperGLUE: A Russian Language Understanding Evaluation Benchmark}, - author={Shavrina, Tatiana - and Fenogenova, Alena - and Emelyanov, Anton - and Shevelev, Denis - and Artemova, Ekaterina - and Malykh, Valentin - and Mikhailov, Vladislav - and Tikhonova, Maria - and Chertok, Andrey - and Evlampiev, Andrey}, - journal={arXiv preprint arXiv:2010.15925}, - year={2020} - }""", + bibtex_citation=r""" +@article{shavrina2020russiansuperglue, + author = {Shavrina, Tatiana +and Fenogenova, Alena +and Emelyanov, Anton +and Shevelev, Denis +and Artemova, Ekaterina +and Malykh, Valentin +and Mikhailov, Vladislav +and Tikhonova, Maria +and Chertok, Andrey +and Evlampiev, Andrey}, + journal = {arXiv preprint arXiv:2010.15925}, + title = {RussianSuperGLUE: A Russian Language Understanding Evaluation Benchmark}, + year = {2020}, +} +""", prompt="Given a premise, retrieve a hypothesis that is entailed by the premise", ) diff --git a/mteb/tasks/PairClassification/zho/CMTEBPairClassification.py b/mteb/tasks/PairClassification/zho/CMTEBPairClassification.py index 3ddb8c290d..63ba4dc64a 100644 --- a/mteb/tasks/PairClassification/zho/CMTEBPairClassification.py +++ b/mteb/tasks/PairClassification/zho/CMTEBPairClassification.py @@ -26,14 +26,16 @@ class Ocnli(AbsTaskPairClassification): annotations_creators=None, dialect=None, sample_creation=None, - 
bibtex_citation="""@misc{hu2020ocnli, - title={OCNLI: Original Chinese Natural Language Inference}, - author={Hai Hu and Kyle Richardson and Liang Xu and Lu Li and Sandra Kuebler and Lawrence S. Moss}, - year={2020}, - eprint={2010.05444}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }""", + bibtex_citation=r""" +@misc{hu2020ocnli, + archiveprefix = {arXiv}, + author = {Hai Hu and Kyle Richardson and Liang Xu and Lu Li and Sandra Kuebler and Lawrence S. Moss}, + eprint = {2010.05444}, + primaryclass = {cs.CL}, + title = {OCNLI: Original Chinese Natural Language Inference}, + year = {2020}, +} +""", prompt="Retrieve semantically similar text.", ) @@ -64,49 +66,51 @@ class Cmnli(AbsTaskPairClassification): annotations_creators=None, dialect=None, sample_creation=None, - bibtex_citation="""@inproceedings{xu-etal-2020-clue, - title = "{CLUE}: A {C}hinese Language Understanding Evaluation Benchmark", - author = "Xu, Liang and - Hu, Hai and - Zhang, Xuanwei and - Li, Lu and - Cao, Chenjie and - Li, Yudong and - Xu, Yechen and - Sun, Kai and - Yu, Dian and - Yu, Cong and - Tian, Yin and - Dong, Qianqian and - Liu, Weitang and - Shi, Bo and - Cui, Yiming and - Li, Junyi and - Zeng, Jun and - Wang, Rongzhao and - Xie, Weijian and - Li, Yanting and - Patterson, Yina and - Tian, Zuoyu and - Zhang, Yiwen and - Zhou, He and - Liu, Shaoweihua and - Zhao, Zhe and - Zhao, Qipeng and - Yue, Cong and - Zhang, Xinrui and - Yang, Zhengliang and - Richardson, Kyle and - Lan, Zhenzhong", - booktitle = "Proceedings of the 28th International Conference on Computational Linguistics", - month = dec, - year = "2020", - address = "Barcelona, Spain (Online)", - publisher = "International Committee on Computational Linguistics", - url = "https://aclanthology.org/2020.coling-main.419", - doi = "10.18653/v1/2020.coling-main.419", - pages = "4762--4772", - }""", + bibtex_citation=r""" +@inproceedings{xu-etal-2020-clue, + address = {Barcelona, Spain (Online)}, + author = {Xu, Liang and +Hu, 
Hai and +Zhang, Xuanwei and +Li, Lu and +Cao, Chenjie and +Li, Yudong and +Xu, Yechen and +Sun, Kai and +Yu, Dian and +Yu, Cong and +Tian, Yin and +Dong, Qianqian and +Liu, Weitang and +Shi, Bo and +Cui, Yiming and +Li, Junyi and +Zeng, Jun and +Wang, Rongzhao and +Xie, Weijian and +Li, Yanting and +Patterson, Yina and +Tian, Zuoyu and +Zhang, Yiwen and +Zhou, He and +Liu, Shaoweihua and +Zhao, Zhe and +Zhao, Qipeng and +Yue, Cong and +Zhang, Xinrui and +Yang, Zhengliang and +Richardson, Kyle and +Lan, Zhenzhong}, + booktitle = {Proceedings of the 28th International Conference on Computational Linguistics}, + doi = {10.18653/v1/2020.coling-main.419}, + month = dec, + pages = {4762--4772}, + publisher = {International Committee on Computational Linguistics}, + title = {{CLUE}: A {C}hinese Language Understanding Evaluation Benchmark}, + url = {https://aclanthology.org/2020.coling-main.419}, + year = {2020}, +} +""", prompt="Retrieve semantically similar text.", ) diff --git a/mteb/tasks/Reranking/ara/NamaaMrTydiReranking.py b/mteb/tasks/Reranking/ara/NamaaMrTydiReranking.py index dd305727ee..b037a2c544 100644 --- a/mteb/tasks/Reranking/ara/NamaaMrTydiReranking.py +++ b/mteb/tasks/Reranking/ara/NamaaMrTydiReranking.py @@ -27,13 +27,15 @@ class NamaaMrTydiReranking(AbsTaskReranking): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@article{muennighoff2022mteb, - doi = {10.48550/ARXIV.2210.07316}, - url = {https://arxiv.org/abs/2210.07316}, + bibtex_citation=r""" +@article{muennighoff2022mteb, author = {Muennighoff, Niklas and Tazi, Nouamane and Magne, Lo{\"\\i}c and Reimers, Nils}, - title = {MTEB: Massive Text Embedding Benchmark}, + doi = {10.48550/ARXIV.2210.07316}, + journal = {arXiv preprint arXiv:2210.07316}, publisher = {arXiv}, - journal={arXiv preprint arXiv:2210.07316}, - year = {2022} -}""", + title = {MTEB: Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2210.07316}, + year = {2022}, +} 
+""", ) diff --git a/mteb/tasks/Reranking/eng/AskUbuntuDupQuestions.py b/mteb/tasks/Reranking/eng/AskUbuntuDupQuestions.py index b96d3d944b..788c0b82e8 100644 --- a/mteb/tasks/Reranking/eng/AskUbuntuDupQuestions.py +++ b/mteb/tasks/Reranking/eng/AskUbuntuDupQuestions.py @@ -28,12 +28,14 @@ class AskUbuntuDupQuestions(AbsTaskReranking): dialect=[], sample_creation="found", prompt="Retrieve duplicate questions from AskUbuntu forum", - bibtex_citation="""@article{wang-2021-TSDAE, - title = "TSDAE: Using Transformer-based Sequential Denoising Auto-Encoderfor Unsupervised Sentence Embedding Learning", - author = "Wang, Kexin and Reimers, Nils and Gurevych, Iryna", - journal= "arXiv preprint arXiv:2104.06979", - month = "4", - year = "2021", - url = "https://arxiv.org/abs/2104.06979", -}""", + bibtex_citation=r""" +@article{wang-2021-TSDAE, + author = {Wang, Kexin and Reimers, Nils and Gurevych, Iryna}, + journal = {arXiv preprint arXiv:2104.06979}, + month = {4}, + title = {TSDAE: Using Transformer-based Sequential Denoising Auto-Encoderfor Unsupervised Sentence Embedding Learning}, + url = {https://arxiv.org/abs/2104.06979}, + year = {2021}, +} +""", ) diff --git a/mteb/tasks/Reranking/eng/BuiltBenchReranking.py b/mteb/tasks/Reranking/eng/BuiltBenchReranking.py index 890978fbf9..3922f4b092 100644 --- a/mteb/tasks/Reranking/eng/BuiltBenchReranking.py +++ b/mteb/tasks/Reranking/eng/BuiltBenchReranking.py @@ -27,12 +27,14 @@ class BuiltBenchReranking(AbsTaskReranking): annotations_creators="derived", dialect=[], sample_creation="created", - bibtex_citation="""@article{shahinmoghadam2024benchmarking, - title={Benchmarking pre-trained text embedding models in aligning built asset information}, - author={Shahinmoghadam, Mehrzad and Motamedi, Ali}, - journal={arXiv preprint arXiv:2411.12056}, - year={2024} -}""", + bibtex_citation=r""" +@article{shahinmoghadam2024benchmarking, + author = {Shahinmoghadam, Mehrzad and Motamedi, Ali}, + journal = {arXiv preprint 
arXiv:2411.12056}, + title = {Benchmarking pre-trained text embedding models in aligning built asset information}, + year = {2024}, +} +""", prompt={ "query": "Given a query, retrieve relevant entity descriptions from buit asset classification systems such as IFC and Uniclass" }, diff --git a/mteb/tasks/Reranking/eng/MindSmallReranking.py b/mteb/tasks/Reranking/eng/MindSmallReranking.py index f1253ba435..db54158188 100644 --- a/mteb/tasks/Reranking/eng/MindSmallReranking.py +++ b/mteb/tasks/Reranking/eng/MindSmallReranking.py @@ -28,23 +28,33 @@ class MindSmallReranking(AbsTaskReranking): dialect=[], sample_creation="found", prompt="Retrieve relevant news articles based on user browsing history", - bibtex_citation="""@inproceedings{wu-etal-2020-mind, title = "{MIND}: A Large-scale Dataset for News - Recommendation", author = "Wu, Fangzhao and Qiao, Ying and Chen, Jiun-Hung and Wu, Chuhan and Qi, - Tao and Lian, Jianxun and Liu, Danyang and Xie, Xing and Gao, Jianfeng and Wu, Winnie and Zhou, Ming", - editor = "Jurafsky, Dan and Chai, Joyce and Schluter, Natalie and Tetreault, Joel", booktitle = - "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics", month = jul, - year = "2020", address = "Online", publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/2020.acl-main.331", doi = "10.18653/v1/2020.acl-main.331", - pages = "3597--3606", abstract = "News recommendation is an important technique for personalized news - service. Compared with product and movie recommendations which have been comprehensively studied, - the research on news recommendation is much more limited, mainly due to the lack of a high-quality benchmark - dataset. In this paper, we present a large-scale dataset named MIND for news recommendation. 
Constructed from - the user click logs of Microsoft News, MIND contains 1 million users and more than 160k English news - articles, each of which has rich textual content such as title, abstract and body. We demonstrate MIND a good - testbed for news recommendation through a comparative study of several state-of-the-art news recommendation - methods which are originally developed on different proprietary datasets. Our results show the performance of - news recommendation highly relies on the quality of news content understanding and user interest modeling. - Many natural language processing techniques such as effective text representation methods and pre-trained - language models can effectively improve the performance of news recommendation. The MIND dataset will be - available at https://msnews.github.io}.", }""", + bibtex_citation=r""" +@inproceedings{wu-etal-2020-mind, + abstract = {News recommendation is an important technique for personalized news +service. Compared with product and movie recommendations which have been comprehensively studied, +the research on news recommendation is much more limited, mainly due to the lack of a high-quality benchmark +dataset. In this paper, we present a large-scale dataset named MIND for news recommendation. Constructed from +the user click logs of Microsoft News, MIND contains 1 million users and more than 160k English news +articles, each of which has rich textual content such as title, abstract and body. We demonstrate MIND a good +testbed for news recommendation through a comparative study of several state-of-the-art news recommendation +methods which are originally developed on different proprietary datasets. Our results show the performance of +news recommendation highly relies on the quality of news content understanding and user interest modeling. 
+Many natural language processing techniques such as effective text representation methods and pre-trained +language models can effectively improve the performance of news recommendation. The MIND dataset will be +available at https://msnews.github.io.}, + address = {Online}, + author = {Wu, Fangzhao and Qiao, Ying and Chen, Jiun-Hung and Wu, Chuhan and Qi, +Tao and Lian, Jianxun and Liu, Danyang and Xie, Xing and Gao, Jianfeng and Wu, Winnie and Zhou, Ming}, + booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics}, + doi = {10.18653/v1/2020.acl-main.331}, + editor = {Jurafsky, Dan and Chai, Joyce and Schluter, Natalie and Tetreault, Joel}, + month = jul, + pages = {3597--3606}, + publisher = {Association for Computational Linguistics}, + title = {{MIND}: A Large-scale Dataset for News +Recommendation}, + url = {https://aclanthology.org/2020.acl-main.331}, + year = {2020}, +} +""", ) diff --git a/mteb/tasks/Reranking/eng/SciDocsReranking.py b/mteb/tasks/Reranking/eng/SciDocsReranking.py index c8ba6d5e2b..99edf718d4 100644 --- a/mteb/tasks/Reranking/eng/SciDocsReranking.py +++ b/mteb/tasks/Reranking/eng/SciDocsReranking.py @@ -28,27 +28,27 @@ class SciDocsReranking(AbsTaskReranking): dialect=None, sample_creation="found", prompt="Given a title of a scientific paper, retrieve the titles of other relevant papers", - bibtex_citation=""" + bibtex_citation=r""" @inproceedings{cohan-etal-2020-specter, - title = "{SPECTER}: Document-level Representation Learning using Citation-informed Transformers", - author = "Cohan, Arman and - Feldman, Sergey and - Beltagy, Iz and - Downey, Doug and - Weld, Daniel", - editor = "Jurafsky, Dan and - Chai, Joyce and - Schluter, Natalie and - Tetreault, Joel", - booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics", - month = jul, - year = "2020", - address = "Online", - publisher = "Association for Computational Linguistics", - url = 
"https://aclanthology.org/2020.acl-main.207", - doi = "10.18653/v1/2020.acl-main.207", - pages = "2270--2282", - abstract = "Representation learning is a critical ingredient for natural language processing systems. Recent Transformer language models like BERT learn powerful textual representations, but these models are targeted towards token- and sentence-level training objectives and do not leverage information on inter-document relatedness, which limits their document-level representation power. For applications on scientific documents, such as classification and recommendation, accurate embeddings of documents are a necessity. We propose SPECTER, a new method to generate document-level embedding of scientific papers based on pretraining a Transformer language model on a powerful signal of document-level relatedness: the citation graph. Unlike existing pretrained language models, Specter can be easily applied to downstream applications without task-specific fine-tuning. Additionally, to encourage further research on document-level models, we introduce SciDocs, a new evaluation benchmark consisting of seven document-level tasks ranging from citation prediction, to document classification and recommendation. We show that Specter outperforms a variety of competitive baselines on the benchmark.", + abstract = {Representation learning is a critical ingredient for natural language processing systems. Recent Transformer language models like BERT learn powerful textual representations, but these models are targeted towards token- and sentence-level training objectives and do not leverage information on inter-document relatedness, which limits their document-level representation power. For applications on scientific documents, such as classification and recommendation, accurate embeddings of documents are a necessity. 
We propose SPECTER, a new method to generate document-level embedding of scientific papers based on pretraining a Transformer language model on a powerful signal of document-level relatedness: the citation graph. Unlike existing pretrained language models, Specter can be easily applied to downstream applications without task-specific fine-tuning. Additionally, to encourage further research on document-level models, we introduce SciDocs, a new evaluation benchmark consisting of seven document-level tasks ranging from citation prediction, to document classification and recommendation. We show that Specter outperforms a variety of competitive baselines on the benchmark.}, + address = {Online}, + author = {Cohan, Arman and +Feldman, Sergey and +Beltagy, Iz and +Downey, Doug and +Weld, Daniel}, + booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics}, + doi = {10.18653/v1/2020.acl-main.207}, + editor = {Jurafsky, Dan and +Chai, Joyce and +Schluter, Natalie and +Tetreault, Joel}, + month = jul, + pages = {2270--2282}, + publisher = {Association for Computational Linguistics}, + title = {{SPECTER}: Document-level Representation Learning using Citation-informed Transformers}, + url = {https://aclanthology.org/2020.acl-main.207}, + year = {2020}, } """, adapted_from=["SCIDOCS"], diff --git a/mteb/tasks/Reranking/eng/StackOverflowDupQuestions.py b/mteb/tasks/Reranking/eng/StackOverflowDupQuestions.py index 897f9d7bc9..82179a01bd 100644 --- a/mteb/tasks/Reranking/eng/StackOverflowDupQuestions.py +++ b/mteb/tasks/Reranking/eng/StackOverflowDupQuestions.py @@ -28,11 +28,13 @@ class StackOverflowDupQuestions(AbsTaskReranking): dialect=[], sample_creation="found", prompt="Retrieve duplicate questions from StackOverflow forum", - bibtex_citation="""@article{Liu2018LinkSOAD, - title={LinkSO: a dataset for learning to retrieve similar question answer pairs on software development forums}, - author={Xueqing Liu and Chi Wang and Yue Leng 
and ChengXiang Zhai}, - journal={Proceedings of the 4th ACM SIGSOFT International Workshop on NLP for Software Engineering}, - year={2018}, - url={https://api.semanticscholar.org/CorpusID:53111679} -}""", + bibtex_citation=r""" +@article{Liu2018LinkSOAD, + author = {Xueqing Liu and Chi Wang and Yue Leng and ChengXiang Zhai}, + journal = {Proceedings of the 4th ACM SIGSOFT International Workshop on NLP for Software Engineering}, + title = {LinkSO: a dataset for learning to retrieve similar question answer pairs on software development forums}, + url = {https://api.semanticscholar.org/CorpusID:53111679}, + year = {2018}, +} +""", ) diff --git a/mteb/tasks/Reranking/eng/WebLINXCandidatesReranking.py b/mteb/tasks/Reranking/eng/WebLINXCandidatesReranking.py index 9db3acb394..a47d0c653e 100644 --- a/mteb/tasks/Reranking/eng/WebLINXCandidatesReranking.py +++ b/mteb/tasks/Reranking/eng/WebLINXCandidatesReranking.py @@ -37,16 +37,16 @@ class WebLINXCandidatesReranking(AbsTaskReranking): annotations_creators="expert-annotated", dialect=[], sample_creation="created", - bibtex_citation=""" + bibtex_citation=r""" @misc{lù2024weblinx, - title={WebLINX: Real-World Website Navigation with Multi-Turn Dialogue}, - author={Xing Han Lù and Zdeněk Kasner and Siva Reddy}, - year={2024}, - eprint={2402.05930}, - archivePrefix={arXiv}, - primaryClass={cs.CL} + archiveprefix = {arXiv}, + author = {Xing Han Lù and Zdeněk Kasner and Siva Reddy}, + eprint = {2402.05930}, + primaryclass = {cs.CL}, + title = {WebLINX: Real-World Website Navigation with Multi-Turn Dialogue}, + year = {2024}, } - """, +""", ) def load_data(self, **kwargs): diff --git a/mteb/tasks/Reranking/fra/AlloprofReranking.py b/mteb/tasks/Reranking/fra/AlloprofReranking.py index 20d24f03ec..f9f36f43c4 100644 --- a/mteb/tasks/Reranking/fra/AlloprofReranking.py +++ b/mteb/tasks/Reranking/fra/AlloprofReranking.py @@ -29,16 +29,18 @@ class AlloprofReranking(AbsTaskReranking): annotations_creators="expert-annotated", 
dialect=None, sample_creation="found", - bibtex_citation="""@misc{lef23, - doi = {10.48550/ARXIV.2302.07738}, - url = {https://arxiv.org/abs/2302.07738}, - author = {Lefebvre-Brossard, Antoine and Gazaille, Stephane and Desmarais, Michel C.}, - keywords = {Computation and Language (cs.CL), Information Retrieval (cs.IR), Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences}, - title = {Alloprof: a new French question-answer education dataset and its use in an information retrieval case study}, - publisher = {arXiv}, - year = {2023}, - copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International} - }""", + bibtex_citation=r""" +@misc{lef23, + author = {Lefebvre-Brossard, Antoine and Gazaille, Stephane and Desmarais, Michel C.}, + copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International}, + doi = {10.48550/ARXIV.2302.07738}, + keywords = {Computation and Language (cs.CL), Information Retrieval (cs.IR), Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences}, + publisher = {arXiv}, + title = {Alloprof: a new French question-answer education dataset and its use in an information retrieval case study}, + url = {https://arxiv.org/abs/2302.07738}, + year = {2023}, +} +""", ) def load_data(self, **kwargs): diff --git a/mteb/tasks/Reranking/fra/SyntecReranking.py b/mteb/tasks/Reranking/fra/SyntecReranking.py index de30fc4b85..fd0444ce55 100644 --- a/mteb/tasks/Reranking/fra/SyntecReranking.py +++ b/mteb/tasks/Reranking/fra/SyntecReranking.py @@ -29,14 +29,16 @@ class SyntecReranking(AbsTaskReranking): annotations_creators="human-annotated", dialect=None, sample_creation="found", - bibtex_citation="""@misc{ciancone2024extending, - title={Extending the Massive Text Embedding Benchmark to French}, - author={Mathieu Ciancone and Imene Kerboua and Marion Schaeffer and Wissam Siblini}, - year={2024}, - eprint={2405.20468}, - 
archivePrefix={arXiv}, - primaryClass={cs.CL} -}""", + bibtex_citation=r""" +@misc{ciancone2024extending, + archiveprefix = {arXiv}, + author = {Mathieu Ciancone and Imene Kerboua and Marion Schaeffer and Wissam Siblini}, + eprint = {2405.20468}, + primaryclass = {cs.CL}, + title = {Extending the Massive Text Embedding Benchmark to French}, + year = {2024}, +} +""", ) def load_data(self, **kwargs): diff --git a/mteb/tasks/Reranking/jpn/MMarcoReranking.py b/mteb/tasks/Reranking/jpn/MMarcoReranking.py index 54290b61cd..fef3d4980e 100644 --- a/mteb/tasks/Reranking/jpn/MMarcoReranking.py +++ b/mteb/tasks/Reranking/jpn/MMarcoReranking.py @@ -27,12 +27,15 @@ class VoyageMMarcoReranking(AbsTaskReranking): dialect=["jpn-Jpan"], sample_creation="found", prompt="Given a Japanese search query, retrieve web passages that answer the question", - bibtex_citation="""@misc{clavié2023jacolbert, - title={JaColBERT and Hard Negatives, Towards Better Japanese-First Embeddings for Retrieval: Early Technical Report}, - author={Benjamin Clavié}, - year={2023}, - eprint={2312.16144}, - archivePrefix={arXiv},}""", + bibtex_citation=r""" +@misc{clavié2023jacolbert, + archiveprefix = {arXiv}, + author = {Benjamin Clavié}, + eprint = {2312.16144}, + title = {JaColBERT and Hard Negatives, Towards Better Japanese-First Embeddings for Retrieval: Early Technical Report}, + year = {2023}, +} +""", ) def dataset_transform(self): diff --git a/mteb/tasks/Reranking/multilingual/ESCIReranking.py b/mteb/tasks/Reranking/multilingual/ESCIReranking.py index 03c6608f27..08f7c86bd6 100644 --- a/mteb/tasks/Reranking/multilingual/ESCIReranking.py +++ b/mteb/tasks/Reranking/multilingual/ESCIReranking.py @@ -15,12 +15,12 @@ "jp": ["jpn-Jpan"], } -_CITATION = """@article{reddy2022shopping, - title={Shopping Queries Dataset: A Large-Scale {ESCI} Benchmark for Improving Product Search}, - author={Chandan K. 
Reddy and Lluís Màrquez and Fran Valero and Nikhil Rao and Hugo Zaragoza and Sambaran Bandyopadhyay and Arnab Biswas and Anlu Xing and Karthik Subbian}, - year={2022}, - eprint={2206.06588}, - archivePrefix={arXiv} +_CITATION = r"""@article{reddy2022shopping, + archiveprefix = {arXiv}, + author = {Chandan K. Reddy and Lluís Màrquez and Fran Valero and Nikhil Rao and Hugo Zaragoza and Sambaran Bandyopadhyay and Arnab Biswas and Anlu Xing and Karthik Subbian}, + eprint = {2206.06588}, + title = {Shopping Queries Dataset: A Large-Scale {ESCI} Benchmark for Improving Product Search}, + year = {2022}, }""" diff --git a/mteb/tasks/Reranking/multilingual/MIRACLReranking.py b/mteb/tasks/Reranking/multilingual/MIRACLReranking.py index 267c832638..535d777bab 100644 --- a/mteb/tasks/Reranking/multilingual/MIRACLReranking.py +++ b/mteb/tasks/Reranking/multilingual/MIRACLReranking.py @@ -37,16 +37,16 @@ "zh": ["zho-Hans"], } -_CITATION = """@article{10.1162/tacl_a_00595, - author = {Zhang, Xinyu and Thakur, Nandan and Ogundepo, Odunayo and Kamalloo, Ehsan and Alfonso-Hermelo, David and Li, Xiaoguang and Liu, Qun and Rezagholizadeh, Mehdi and Lin, Jimmy}, - title = "{MIRACL: A Multilingual Retrieval Dataset Covering 18 Diverse Languages}", - journal = {Transactions of the Association for Computational Linguistics}, - volume = {11}, - pages = {1114-1131}, - year = {2023}, - month = {09}, - issn = {2307-387X}, - doi = {10.1162/tacl_a_00595}, +_CITATION = r"""@article{10.1162/tacl_a_00595, + author = {Zhang, Xinyu and Thakur, Nandan and Ogundepo, Odunayo and Kamalloo, Ehsan and Alfonso-Hermelo, David and Li, Xiaoguang and Liu, Qun and Rezagholizadeh, Mehdi and Lin, Jimmy}, + doi = {10.1162/tacl_a_00595}, + issn = {2307-387X}, + journal = {Transactions of the Association for Computational Linguistics}, + month = {09}, + pages = {1114-1131}, + title = {{MIRACL: A Multilingual Retrieval Dataset Covering 18 Diverse Languages}}, + volume = {11}, + year = {2023}, }""" diff --git 
a/mteb/tasks/Reranking/multilingual/WikipediaRerankingMultilingual.py b/mteb/tasks/Reranking/multilingual/WikipediaRerankingMultilingual.py index 3bfbd04f13..cb42eb532a 100644 --- a/mteb/tasks/Reranking/multilingual/WikipediaRerankingMultilingual.py +++ b/mteb/tasks/Reranking/multilingual/WikipediaRerankingMultilingual.py @@ -47,9 +47,11 @@ class WikipediaRerankingMultilingual(MultilingualTask, AbsTaskReranking): annotations_creators="LM-generated and reviewed", dialect=[], sample_creation="LM-generated and verified", - bibtex_citation="""@ONLINE{wikidump, - author = "Wikimedia Foundation", - title = "Wikimedia Downloads", - url = "https://dumps.wikimedia.org" -}""", + bibtex_citation=r""" +@online{wikidump, + author = {Wikimedia Foundation}, + title = {Wikimedia Downloads}, + url = {https://dumps.wikimedia.org}, +} +""", ) diff --git a/mteb/tasks/Reranking/rus/RuBQReranking.py b/mteb/tasks/Reranking/rus/RuBQReranking.py index fb79a17588..8399005f2a 100644 --- a/mteb/tasks/Reranking/rus/RuBQReranking.py +++ b/mteb/tasks/Reranking/rus/RuBQReranking.py @@ -27,13 +27,15 @@ class RuBQReranking(AbsTaskReranking): annotations_creators="human-annotated", dialect=[], sample_creation="created", - bibtex_citation="""@inproceedings{RuBQ2021, - title={RuBQ 2.0: An Innovated Russian Question Answering Dataset}, - author={Ivan Rybin and Vladislav Korablinov and Pavel Efimov and Pavel Braslavski}, - booktitle={ESWC}, - year={2021}, - pages={532--547} - }""", + bibtex_citation=r""" +@inproceedings{RuBQ2021, + author = {Ivan Rybin and Vladislav Korablinov and Pavel Efimov and Pavel Braslavski}, + booktitle = {ESWC}, + pages = {532--547}, + title = {RuBQ 2.0: An Innovated Russian Question Answering Dataset}, + year = {2021}, +} +""", prompt={ "query": "Given a question, retrieve Wikipedia passages that answer the question.", }, diff --git a/mteb/tasks/Reranking/zho/CMTEBReranking.py b/mteb/tasks/Reranking/zho/CMTEBReranking.py index c701aa9227..2ad84444b2 100644 --- 
a/mteb/tasks/Reranking/zho/CMTEBReranking.py +++ b/mteb/tasks/Reranking/zho/CMTEBReranking.py @@ -21,21 +21,23 @@ class T2Reranking(AbsTaskReranking): main_score="map", date=None, form=None, - domains=None, + domains=[], task_subtypes=None, - license=None, + license="not specified", annotations_creators=None, dialect=None, sample_creation=None, prompt="Given a Chinese search query, retrieve web passages that answer the question", - bibtex_citation="""@misc{xie2023t2ranking, - title={T2Ranking: A large-scale Chinese Benchmark for Passage Ranking}, - author={Xiaohui Xie and Qian Dong and Bingning Wang and Feiyang Lv and Ting Yao and Weinan Gan and Zhijing Wu and Xiangsheng Li and Haitao Li and Yiqun Liu and Jin Ma}, - year={2023}, - eprint={2304.03679}, - archivePrefix={arXiv}, - primaryClass={cs.IR} -}""", + bibtex_citation=r""" +@misc{xie2023t2ranking, + archiveprefix = {arXiv}, + author = {Xiaohui Xie and Qian Dong and Bingning Wang and Feiyang Lv and Ting Yao and Weinan Gan and Zhijing Wu and Xiangsheng Li and Haitao Li and Yiqun Liu and Jin Ma}, + eprint = {2304.03679}, + primaryclass = {cs.IR}, + title = {T2Ranking: A large-scale Chinese Benchmark for Passage Ranking}, + year = {2023}, +} +""", ) @@ -63,14 +65,16 @@ class MMarcoReranking(AbsTaskReranking): dialect=None, sample_creation=None, prompt="Given a Chinese search query, retrieve web passages that answer the question", - bibtex_citation="""@misc{bonifacio2021mmarco, - title={mMARCO: A Multilingual Version of MS MARCO Passage Ranking Dataset}, - author={Luiz Henrique Bonifacio and Vitor Jeronymo and Hugo Queiroz Abonizio and Israel Campiotti and Marzieh Fadaee and and Roberto Lotufo and Rodrigo Nogueira}, - year={2021}, - eprint={2108.13897}, - archivePrefix={arXiv}, - primaryClass={cs.CL} -}""", + bibtex_citation=r""" +@misc{bonifacio2021mmarco, + archiveprefix = {arXiv}, + author = {Luiz Henrique Bonifacio and Vitor Jeronymo and Hugo Queiroz Abonizio and Israel Campiotti and Marzieh Fadaee and and 
Roberto Lotufo and Rodrigo Nogueira}, + eprint = {2108.13897}, + primaryclass = {cs.CL}, + title = {mMARCO: A Multilingual Version of MS MARCO Passage Ranking Dataset}, + year = {2021}, +} +""", ) @@ -97,16 +101,18 @@ class CMedQAv1(AbsTaskReranking): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@article{zhang2017chinese, - title={Chinese Medical Question Answer Matching Using End-to-End Character-Level Multi-Scale CNNs}, - author={Zhang, Sheng and Zhang, Xin and Wang, Hui and Cheng, Jiajun and Li, Pei and Ding, Zhaoyun}, - journal={Applied Sciences}, - volume={7}, - number={8}, - pages={767}, - year={2017}, - publisher={Multidisciplinary Digital Publishing Institute} -}""", + bibtex_citation=r""" +@article{zhang2017chinese, + author = {Zhang, Sheng and Zhang, Xin and Wang, Hui and Cheng, Jiajun and Li, Pei and Ding, Zhaoyun}, + journal = {Applied Sciences}, + number = {8}, + pages = {767}, + publisher = {Multidisciplinary Digital Publishing Institute}, + title = {Chinese Medical Question Answer Matching Using End-to-End Character-Level Multi-Scale CNNs}, + volume = {7}, + year = {2017}, +} +""", ) @@ -134,16 +140,19 @@ class CMedQAv2(AbsTaskReranking): annotations_creators=None, dialect=None, sample_creation=None, - bibtex_citation="""@ARTICLE{8548603, -author={S. Zhang and X. Zhang and H. Wang and L. Guo and S. Liu}, -journal={IEEE Access}, -title={Multi-Scale Attentive Interaction Networks for Chinese Medical Question Answer Selection}, -year={2018}, -volume={6}, -number={}, -pages={74061-74071}, -keywords={Biomedical imaging;Data mining;Semantics;Medical services;Feature extraction;Knowledge discovery;Medical question answering;interactive attention;deep learning;deep neural networks}, -doi={10.1109/ACCESS.2018.2883637}, -ISSN={2169-3536}, -month={},}""", + bibtex_citation=r""" +@article{8548603, + author = {S. Zhang and X. Zhang and H. Wang and L. Guo and S. 
Liu}, + doi = {10.1109/ACCESS.2018.2883637}, + issn = {2169-3536}, + journal = {IEEE Access}, + keywords = {Biomedical imaging;Data mining;Semantics;Medical services;Feature extraction;Knowledge discovery;Medical question answering;interactive attention;deep learning;deep neural networks}, + month = {}, + number = {}, + pages = {74061-74071}, + title = {Multi-Scale Attentive Interaction Networks for Chinese Medical Question Answer Selection}, + volume = {6}, + year = {2018}, +} +""", ) diff --git a/mteb/tasks/Retrieval/__init__.py b/mteb/tasks/Retrieval/__init__.py index a13fa94bfc..b2abdd19e5 100644 --- a/mteb/tasks/Retrieval/__init__.py +++ b/mteb/tasks/Retrieval/__init__.py @@ -188,6 +188,8 @@ from .swe.SwednRetrieval import * from .swe.SweFaqRetrieval import * from .tur.TurHistQuad import * +from .vie.GreenNodeTableMarkdownRetrieval import * from .vie.VieQuADRetrieval import * +from .vie.ZacLegalTextRetrieval import * from .zho.CMTEBRetrieval import * from .zho.LeCaRDv2Retrieval import * diff --git a/mteb/tasks/Retrieval/ara/SadeemQuestionRetrieval.py b/mteb/tasks/Retrieval/ara/SadeemQuestionRetrieval.py index 2009a91c79..f82d06ee61 100644 --- a/mteb/tasks/Retrieval/ara/SadeemQuestionRetrieval.py +++ b/mteb/tasks/Retrieval/ara/SadeemQuestionRetrieval.py @@ -1,60 +1,60 @@ -from __future__ import annotations - -import datasets - -from mteb.abstasks.TaskMetadata import TaskMetadata - -from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval - - -class SadeemQuestionRetrieval(AbsTaskRetrieval): - _EVAL_SPLIT = "test" - - metadata = TaskMetadata( - name="SadeemQuestionRetrieval", - dataset={ - "path": "sadeem-ai/sadeem-ar-eval-retrieval-questions", - "revision": "3cb0752b182e5d5d740df547748b06663c8e0bd9", - "name": "test", - }, - reference="https://huggingface.co/datasets/sadeem-ai/sadeem-ar-eval-retrieval-questions", - description="SadeemQuestion: A Benchmark Data Set for Community Question-Retrieval Research", - type="Retrieval", - category="s2p", - 
modalities=["text"], - eval_splits=[_EVAL_SPLIT], - eval_langs=["ara-Arab"], - main_score="ndcg_at_10", - date=("2024-01-01", "2024-04-01"), - domains=["Written", "Written"], - task_subtypes=["Article retrieval"], - license="not specified", - annotations_creators="derived", - dialect=[], - sample_creation="found", - bibtex_citation=""" - @inproceedings{sadeem-2024-ar-retrieval-questions, - title = "SadeemQuestionRetrieval: A New Benchmark for Arabic questions-based Articles Searching.", - author = "abubakr.soliman@sadeem.app" - } - """, - ) - - def load_data(self, **kwargs): - if self.data_loaded: - return - - query_list = datasets.load_dataset(**self.metadata_dict["dataset"])["queries"] - queries = {row["query-id"]: row["text"] for row in query_list} - - corpus_list = datasets.load_dataset(**self.metadata_dict["dataset"])["corpus"] - corpus = {row["corpus-id"]: {"text": row["text"]} for row in corpus_list} - - qrels_list = datasets.load_dataset(**self.metadata_dict["dataset"])["qrels"] - qrels = {row["query-id"]: {row["corpus-id"]: 1} for row in qrels_list} - - self.corpus = {self._EVAL_SPLIT: corpus} - self.queries = {self._EVAL_SPLIT: queries} - self.relevant_docs = {self._EVAL_SPLIT: qrels} - - self.data_loaded = True +from __future__ import annotations + +import datasets + +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval + + +class SadeemQuestionRetrieval(AbsTaskRetrieval): + _EVAL_SPLIT = "test" + + metadata = TaskMetadata( + name="SadeemQuestionRetrieval", + dataset={ + "path": "sadeem-ai/sadeem-ar-eval-retrieval-questions", + "revision": "3cb0752b182e5d5d740df547748b06663c8e0bd9", + "name": "test", + }, + reference="https://huggingface.co/datasets/sadeem-ai/sadeem-ar-eval-retrieval-questions", + description="SadeemQuestion: A Benchmark Data Set for Community Question-Retrieval Research", + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=[_EVAL_SPLIT], + 
eval_langs=["ara-Arab"], + main_score="ndcg_at_10", + date=("2024-01-01", "2024-04-01"), + domains=["Written", "Written"], + task_subtypes=["Article retrieval"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@inproceedings{sadeem-2024-ar-retrieval-questions, + author = {abubakr.soliman@sadeem.app}, + title = {SadeemQuestionRetrieval: A New Benchmark for Arabic questions-based Articles Searching.}, +} +""", + ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + query_list = datasets.load_dataset(**self.metadata_dict["dataset"])["queries"] + queries = {row["query-id"]: row["text"] for row in query_list} + + corpus_list = datasets.load_dataset(**self.metadata_dict["dataset"])["corpus"] + corpus = {row["corpus-id"]: {"text": row["text"]} for row in corpus_list} + + qrels_list = datasets.load_dataset(**self.metadata_dict["dataset"])["qrels"] + qrels = {row["query-id"]: {row["corpus-id"]: 1} for row in qrels_list} + + self.corpus = {self._EVAL_SPLIT: corpus} + self.queries = {self._EVAL_SPLIT: queries} + self.relevant_docs = {self._EVAL_SPLIT: qrels} + + self.data_loaded = True diff --git a/mteb/tasks/Retrieval/code/AppsRetrieval.py b/mteb/tasks/Retrieval/code/AppsRetrieval.py index e207f8e340..62fc53560f 100644 --- a/mteb/tasks/Retrieval/code/AppsRetrieval.py +++ b/mteb/tasks/Retrieval/code/AppsRetrieval.py @@ -28,10 +28,12 @@ class AppsRetrieval(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@article{hendrycksapps2021, - title={Measuring Coding Challenge Competence With APPS}, - author={Dan Hendrycks and Steven Basart and Saurav Kadavath and Mantas Mazeika and Akul Arora and Ethan Guo and Collin Burns and Samir Puranik and Horace He and Dawn Song and Jacob Steinhardt}, - journal={NeurIPS}, - year={2021} - }""", + bibtex_citation=r""" +@article{hendrycksapps2021, + author = {Dan Hendrycks and Steven Basart and 
Saurav Kadavath and Mantas Mazeika and Akul Arora and Ethan Guo and Collin Burns and Samir Puranik and Horace He and Dawn Song and Jacob Steinhardt}, + journal = {NeurIPS}, + title = {Measuring Coding Challenge Competence With APPS}, + year = {2021}, +} +""", ) diff --git a/mteb/tasks/Retrieval/code/COIRCodeSearchNetRetrieval.py b/mteb/tasks/Retrieval/code/COIRCodeSearchNetRetrieval.py index 29858026a6..5306f033aa 100644 --- a/mteb/tasks/Retrieval/code/COIRCodeSearchNetRetrieval.py +++ b/mteb/tasks/Retrieval/code/COIRCodeSearchNetRetrieval.py @@ -96,7 +96,14 @@ class COIRCodeSearchNetRetrieval(MultilingualTask, AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="@article{husain2019codesearchnet, title={{CodeSearchNet} challenge: Evaluating the state of semantic code search}, author={Husain, Hamel and Wu, Ho-Hsiang and Gazit, Tiferet and Allamanis, Miltiadis and Brockschmidt, Marc}, journal={arXiv preprint arXiv:1909.09436}, year={2019} }", + bibtex_citation=r""" +@article{husain2019codesearchnet, + author = {Husain, Hamel and Wu, Ho-Hsiang and Gazit, Tiferet and Allamanis, Miltiadis and Brockschmidt, Marc}, + journal = {arXiv preprint arXiv:1909.09436}, + title = {{CodeSearchNet} challenge: Evaluating the state of semantic code search}, + year = {2019}, +} +""", ) def load_data(self, **kwargs): diff --git a/mteb/tasks/Retrieval/code/CodeEditSearchRetrieval.py b/mteb/tasks/Retrieval/code/CodeEditSearchRetrieval.py index e3175fa324..4f46641d69 100644 --- a/mteb/tasks/Retrieval/code/CodeEditSearchRetrieval.py +++ b/mteb/tasks/Retrieval/code/CodeEditSearchRetrieval.py @@ -46,7 +46,14 @@ class CodeEditSearchRetrieval(MultilingualTask, AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="@article{muennighoff2023octopack, title={OctoPack: Instruction Tuning Code Large Language Models}, author={Niklas Muennighoff and Qian Liu and Armel Zebaze and Qinkai Zheng and 
Binyuan Hui and Terry Yue Zhuo and Swayam Singh and Xiangru Tang and Leandro von Werra and Shayne Longpre}, journal={arXiv preprint arXiv:2308.07124}, year={2023} }", + bibtex_citation=r""" +@article{muennighoff2023octopack, + author = {Niklas Muennighoff and Qian Liu and Armel Zebaze and Qinkai Zheng and Binyuan Hui and Terry Yue Zhuo and Swayam Singh and Xiangru Tang and Leandro von Werra and Shayne Longpre}, + journal = {arXiv preprint arXiv:2308.07124}, + title = {OctoPack: Instruction Tuning Code Large Language Models}, + year = {2023}, +} +""", ) def load_data(self, **kwargs): diff --git a/mteb/tasks/Retrieval/code/CodeFeedbackMTRetrieval.py b/mteb/tasks/Retrieval/code/CodeFeedbackMTRetrieval.py index fcb1a822b2..45dbcb8603 100644 --- a/mteb/tasks/Retrieval/code/CodeFeedbackMTRetrieval.py +++ b/mteb/tasks/Retrieval/code/CodeFeedbackMTRetrieval.py @@ -28,13 +28,15 @@ class CodeFeedbackMT(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@misc{zheng2024opencodeinterpreterintegratingcodegeneration, - title={OpenCodeInterpreter: Integrating Code Generation with Execution and Refinement}, - author={Tianyu Zheng and Ge Zhang and Tianhao Shen and Xueling Liu and Bill Yuchen Lin and Jie Fu and Wenhu Chen and Xiang Yue}, - year={2024}, - eprint={2402.14658}, - archivePrefix={arXiv}, - primaryClass={cs.SE}, - url={https://arxiv.org/abs/2402.14658}, - }""", + bibtex_citation=r""" +@misc{zheng2024opencodeinterpreterintegratingcodegeneration, + archiveprefix = {arXiv}, + author = {Tianyu Zheng and Ge Zhang and Tianhao Shen and Xueling Liu and Bill Yuchen Lin and Jie Fu and Wenhu Chen and Xiang Yue}, + eprint = {2402.14658}, + primaryclass = {cs.SE}, + title = {OpenCodeInterpreter: Integrating Code Generation with Execution and Refinement}, + url = {https://arxiv.org/abs/2402.14658}, + year = {2024}, +} +""", ) diff --git a/mteb/tasks/Retrieval/code/CodeFeedbackSTRetrieval.py 
b/mteb/tasks/Retrieval/code/CodeFeedbackSTRetrieval.py index 2a99c990c4..00994ac642 100644 --- a/mteb/tasks/Retrieval/code/CodeFeedbackSTRetrieval.py +++ b/mteb/tasks/Retrieval/code/CodeFeedbackSTRetrieval.py @@ -28,13 +28,15 @@ class CodeFeedbackST(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@misc{li2024coircomprehensivebenchmarkcode, - title={CoIR: A Comprehensive Benchmark for Code Information Retrieval Models}, - author={Xiangyang Li and Kuicai Dong and Yi Quan Lee and Wei Xia and Yichun Yin and Hao Zhang and Yong Liu and Yasheng Wang and Ruiming Tang}, - year={2024}, - eprint={2407.02883}, - archivePrefix={arXiv}, - primaryClass={cs.IR}, - url={https://arxiv.org/abs/2407.02883}, - }""", + bibtex_citation=r""" +@misc{li2024coircomprehensivebenchmarkcode, + archiveprefix = {arXiv}, + author = {Xiangyang Li and Kuicai Dong and Yi Quan Lee and Wei Xia and Yichun Yin and Hao Zhang and Yong Liu and Yasheng Wang and Ruiming Tang}, + eprint = {2407.02883}, + primaryclass = {cs.IR}, + title = {CoIR: A Comprehensive Benchmark for Code Information Retrieval Models}, + url = {https://arxiv.org/abs/2407.02883}, + year = {2024}, +} +""", ) diff --git a/mteb/tasks/Retrieval/code/CodeRAG.py b/mteb/tasks/Retrieval/code/CodeRAG.py index 1573aa7ff9..0123d30428 100644 --- a/mteb/tasks/Retrieval/code/CodeRAG.py +++ b/mteb/tasks/Retrieval/code/CodeRAG.py @@ -30,14 +30,14 @@ def split_by_first_newline(s): "sample_creation": "found", "bibtex_citation": """ @misc{wang2024coderagbenchretrievalaugmentcode, - title={CodeRAG-Bench: Can Retrieval Augment Code Generation?}, - author={Zora Zhiruo Wang and Akari Asai and Xinyan Velocity Yu and Frank F. 
Xu and Yiqing Xie and Graham Neubig and Daniel Fried}, - year={2024}, - eprint={2406.14497}, - archivePrefix={arXiv}, - primaryClass={cs.SE}, - url={https://arxiv.org/abs/2406.14497}, - } + archiveprefix = {arXiv}, + author = {Zora Zhiruo Wang and Akari Asai and Xinyan Velocity Yu and Frank F. Xu and Yiqing Xie and Graham Neubig and Daniel Fried}, + eprint = {2406.14497}, + primaryclass = {cs.SE}, + title = {CodeRAG-Bench: Can Retrieval Augment Code Generation?}, + url = {https://arxiv.org/abs/2406.14497}, + year = {2024}, +} """, } diff --git a/mteb/tasks/Retrieval/code/CodeSearchNetCCRetrieval.py b/mteb/tasks/Retrieval/code/CodeSearchNetCCRetrieval.py index 3f5ca2e028..71e579efcc 100644 --- a/mteb/tasks/Retrieval/code/CodeSearchNetCCRetrieval.py +++ b/mteb/tasks/Retrieval/code/CodeSearchNetCCRetrieval.py @@ -95,15 +95,17 @@ class CodeSearchNetCCRetrieval(MultilingualTask, AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@misc{li2024coircomprehensivebenchmarkcode, - title={CoIR: A Comprehensive Benchmark for Code Information Retrieval Models}, - author={Xiangyang Li and Kuicai Dong and Yi Quan Lee and Wei Xia and Yichun Yin and Hao Zhang and Yong Liu and Yasheng Wang and Ruiming Tang}, - year={2024}, - eprint={2407.02883}, - archivePrefix={arXiv}, - primaryClass={cs.IR}, - url={https://arxiv.org/abs/2407.02883}, - }""", + bibtex_citation=r""" +@misc{li2024coircomprehensivebenchmarkcode, + archiveprefix = {arXiv}, + author = {Xiangyang Li and Kuicai Dong and Yi Quan Lee and Wei Xia and Yichun Yin and Hao Zhang and Yong Liu and Yasheng Wang and Ruiming Tang}, + eprint = {2407.02883}, + primaryclass = {cs.IR}, + title = {CoIR: A Comprehensive Benchmark for Code Information Retrieval Models}, + url = {https://arxiv.org/abs/2407.02883}, + year = {2024}, +} +""", ) def load_data(self, **kwargs): diff --git a/mteb/tasks/Retrieval/code/CodeSearchNetRetrieval.py 
b/mteb/tasks/Retrieval/code/CodeSearchNetRetrieval.py index ddcef675f5..1eb657f35e 100644 --- a/mteb/tasks/Retrieval/code/CodeSearchNetRetrieval.py +++ b/mteb/tasks/Retrieval/code/CodeSearchNetRetrieval.py @@ -32,7 +32,14 @@ class CodeSearchNetRetrieval(MultilingualTask, AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="@article{husain2019codesearchnet, title={{CodeSearchNet} challenge: Evaluating the state of semantic code search}, author={Husain, Hamel and Wu, Ho-Hsiang and Gazit, Tiferet and Allamanis, Miltiadis and Brockschmidt, Marc}, journal={arXiv preprint arXiv:1909.09436}, year={2019} }", + bibtex_citation=r""" +@article{husain2019codesearchnet, + author = {Husain, Hamel and Wu, Ho-Hsiang and Gazit, Tiferet and Allamanis, Miltiadis and Brockschmidt, Marc}, + journal = {arXiv preprint arXiv:1909.09436}, + title = {{CodeSearchNet} challenge: Evaluating the state of semantic code search}, + year = {2019}, +} +""", ) def load_data(self, **kwargs): diff --git a/mteb/tasks/Retrieval/code/CodeTransOceanContestRetrieval.py b/mteb/tasks/Retrieval/code/CodeTransOceanContestRetrieval.py index 9933e8fe87..423be6bdfc 100644 --- a/mteb/tasks/Retrieval/code/CodeTransOceanContestRetrieval.py +++ b/mteb/tasks/Retrieval/code/CodeTransOceanContestRetrieval.py @@ -28,13 +28,15 @@ class CodeTransOceanContestRetrieval(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@misc{yan2023codetransoceancomprehensivemultilingualbenchmark, - title={CodeTransOcean: A Comprehensive Multilingual Benchmark for Code Translation}, - author={Weixiang Yan and Yuchen Tian and Yunzhe Li and Qian Chen and Wen Wang}, - year={2023}, - eprint={2310.04951}, - archivePrefix={arXiv}, - primaryClass={cs.AI}, - url={https://arxiv.org/abs/2310.04951}, - }""", + bibtex_citation=r""" +@misc{yan2023codetransoceancomprehensivemultilingualbenchmark, + archiveprefix = {arXiv}, + author = {Weixiang 
Yan and Yuchen Tian and Yunzhe Li and Qian Chen and Wen Wang}, + eprint = {2310.04951}, + primaryclass = {cs.AI}, + title = {CodeTransOcean: A Comprehensive Multilingual Benchmark for Code Translation}, + url = {https://arxiv.org/abs/2310.04951}, + year = {2023}, +} +""", ) diff --git a/mteb/tasks/Retrieval/code/CodeTransOceanDLRetrieval.py b/mteb/tasks/Retrieval/code/CodeTransOceanDLRetrieval.py index f17e1df43b..77107edde9 100644 --- a/mteb/tasks/Retrieval/code/CodeTransOceanDLRetrieval.py +++ b/mteb/tasks/Retrieval/code/CodeTransOceanDLRetrieval.py @@ -28,13 +28,15 @@ class CodeTransOceanDLRetrieval(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@misc{yan2023codetransoceancomprehensivemultilingualbenchmark, - title={CodeTransOcean: A Comprehensive Multilingual Benchmark for Code Translation}, - author={Weixiang Yan and Yuchen Tian and Yunzhe Li and Qian Chen and Wen Wang}, - year={2023}, - eprint={2310.04951}, - archivePrefix={arXiv}, - primaryClass={cs.AI}, - url={https://arxiv.org/abs/2310.04951}, - }""", + bibtex_citation=r""" +@misc{yan2023codetransoceancomprehensivemultilingualbenchmark, + archiveprefix = {arXiv}, + author = {Weixiang Yan and Yuchen Tian and Yunzhe Li and Qian Chen and Wen Wang}, + eprint = {2310.04951}, + primaryclass = {cs.AI}, + title = {CodeTransOcean: A Comprehensive Multilingual Benchmark for Code Translation}, + url = {https://arxiv.org/abs/2310.04951}, + year = {2023}, +} +""", ) diff --git a/mteb/tasks/Retrieval/code/CosQARetrieval.py b/mteb/tasks/Retrieval/code/CosQARetrieval.py index e0b975aebe..9ad7bcb9aa 100644 --- a/mteb/tasks/Retrieval/code/CosQARetrieval.py +++ b/mteb/tasks/Retrieval/code/CosQARetrieval.py @@ -28,13 +28,15 @@ class CosQARetrieval(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@misc{huang2021cosqa20000webqueries, - title={CoSQA: 20,000+ Web Queries for Code Search and Question 
Answering}, - author={Junjie Huang and Duyu Tang and Linjun Shou and Ming Gong and Ke Xu and Daxin Jiang and Ming Zhou and Nan Duan}, - year={2021}, - eprint={2105.13239}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2105.13239}, - }""", + bibtex_citation=r""" +@misc{huang2021cosqa20000webqueries, + archiveprefix = {arXiv}, + author = {Junjie Huang and Duyu Tang and Linjun Shou and Ming Gong and Ke Xu and Daxin Jiang and Ming Zhou and Nan Duan}, + eprint = {2105.13239}, + primaryclass = {cs.CL}, + title = {CoSQA: 20,000+ Web Queries for Code Search and Question Answering}, + url = {https://arxiv.org/abs/2105.13239}, + year = {2021}, +} +""", ) diff --git a/mteb/tasks/Retrieval/code/StackOverflowQARetrieval.py b/mteb/tasks/Retrieval/code/StackOverflowQARetrieval.py index 3f06da1660..3484a15ea9 100644 --- a/mteb/tasks/Retrieval/code/StackOverflowQARetrieval.py +++ b/mteb/tasks/Retrieval/code/StackOverflowQARetrieval.py @@ -28,13 +28,15 @@ class StackOverflowQARetrieval(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@misc{li2024coircomprehensivebenchmarkcode, - title={CoIR: A Comprehensive Benchmark for Code Information Retrieval Models}, - author={Xiangyang Li and Kuicai Dong and Yi Quan Lee and Wei Xia and Yichun Yin and Hao Zhang and Yong Liu and Yasheng Wang and Ruiming Tang}, - year={2024}, - eprint={2407.02883}, - archivePrefix={arXiv}, - primaryClass={cs.IR}, - url={https://arxiv.org/abs/2407.02883}, - }""", + bibtex_citation=r""" +@misc{li2024coircomprehensivebenchmarkcode, + archiveprefix = {arXiv}, + author = {Xiangyang Li and Kuicai Dong and Yi Quan Lee and Wei Xia and Yichun Yin and Hao Zhang and Yong Liu and Yasheng Wang and Ruiming Tang}, + eprint = {2407.02883}, + primaryclass = {cs.IR}, + title = {CoIR: A Comprehensive Benchmark for Code Information Retrieval Models}, + url = {https://arxiv.org/abs/2407.02883}, + year = {2024}, +} +""", ) diff --git 
a/mteb/tasks/Retrieval/code/SyntheticText2SqlRetrieval.py b/mteb/tasks/Retrieval/code/SyntheticText2SqlRetrieval.py index cd4cd8835e..6e3300aea4 100644 --- a/mteb/tasks/Retrieval/code/SyntheticText2SqlRetrieval.py +++ b/mteb/tasks/Retrieval/code/SyntheticText2SqlRetrieval.py @@ -28,11 +28,13 @@ class SyntheticText2SQLRetrieval(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@software{gretel-synthetic-text-to-sql-2024, - author = {Meyer, Yev and Emadi, Marjan and Nathawani, Dhruv and Ramaswamy, Lipika and Boyd, Kendrick and Van Segbroeck, Maarten and Grossman, Matthew and Mlocek, Piotr and Newberry, Drew}, - title = {{Synthetic-Text-To-SQL}: A synthetic dataset for training language models to generate SQL queries from natural language prompts}, - month = {April}, - year = {2024}, - url = {https://huggingface.co/datasets/gretelai/synthetic-text-to-sql} - }""", + bibtex_citation=r""" +@software{gretel-synthetic-text-to-sql-2024, + author = {Meyer, Yev and Emadi, Marjan and Nathawani, Dhruv and Ramaswamy, Lipika and Boyd, Kendrick and Van Segbroeck, Maarten and Grossman, Matthew and Mlocek, Piotr and Newberry, Drew}, + month = {April}, + title = {{Synthetic-Text-To-SQL}: A synthetic dataset for training language models to generate SQL queries from natural language prompts}, + url = {https://huggingface.co/datasets/gretelai/synthetic-text-to-sql}, + year = {2024}, +} +""", ) diff --git a/mteb/tasks/Retrieval/dan/DanFeverRetrieval.py b/mteb/tasks/Retrieval/dan/DanFeverRetrieval.py index 6a7b239f2f..83ce3dd752 100644 --- a/mteb/tasks/Retrieval/dan/DanFeverRetrieval.py +++ b/mteb/tasks/Retrieval/dan/DanFeverRetrieval.py @@ -27,21 +27,21 @@ class DanFeverRetrieval(AbsTaskRetrieval): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" + bibtex_citation=r""" @inproceedings{norregaard-derczynski-2021-danfever, - title = "{D}an{FEVER}: claim verification dataset for 
{D}anish", - author = "N{\o}rregaard, Jeppe and - Derczynski, Leon", - editor = "Dobnik, Simon and - {\O}vrelid, Lilja", - booktitle = "Proceedings of the 23rd Nordic Conference on Computational Linguistics (NoDaLiDa)", - month = may # " 31--2 " # jun, - year = "2021", - address = "Reykjavik, Iceland (Online)", - publisher = {Link{\"o}ping University Electronic Press, Sweden}, - url = "https://aclanthology.org/2021.nodalida-main.47", - pages = "422--428", - abstract = "We present a dataset, DanFEVER, intended for multilingual misinformation research. The dataset is in Danish and has the same format as the well-known English FEVER dataset. It can be used for testing methods in multilingual settings, as well as for creating models in production for the Danish language.", + abstract = {We present a dataset, DanFEVER, intended for multilingual misinformation research. The dataset is in Danish and has the same format as the well-known English FEVER dataset. It can be used for testing methods in multilingual settings, as well as for creating models in production for the Danish language.}, + address = {Reykjavik, Iceland (Online)}, + author = {N{\o}rregaard, Jeppe and +Derczynski, Leon}, + booktitle = {Proceedings of the 23rd Nordic Conference on Computational Linguistics (NoDaLiDa)}, + editor = {Dobnik, Simon and +{\O}vrelid, Lilja}, + month = may # { 31--2 } # jun, + pages = {422--428}, + publisher = {Link{\"o}ping University Electronic Press, Sweden}, + title = {{D}an{FEVER}: claim verification dataset for {D}anish}, + url = {https://aclanthology.org/2021.nodalida-main.47}, + year = {2021}, } """, prompt={ @@ -130,21 +130,21 @@ class DanFever(AbsTaskRetrieval): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" + bibtex_citation=r""" @inproceedings{norregaard-derczynski-2021-danfever, - title = "{D}an{FEVER}: claim verification dataset for {D}anish", - author = "N{\o}rregaard, Jeppe and - Derczynski, Leon", - editor = 
"Dobnik, Simon and - {\O}vrelid, Lilja", - booktitle = "Proceedings of the 23rd Nordic Conference on Computational Linguistics (NoDaLiDa)", - month = may # " 31--2 " # jun, - year = "2021", - address = "Reykjavik, Iceland (Online)", - publisher = {Link{\"o}ping University Electronic Press, Sweden}, - url = "https://aclanthology.org/2021.nodalida-main.47", - pages = "422--428", - abstract = "We present a dataset, DanFEVER, intended for multilingual misinformation research. The dataset is in Danish and has the same format as the well-known English FEVER dataset. It can be used for testing methods in multilingual settings, as well as for creating models in production for the Danish language.", + abstract = {We present a dataset, DanFEVER, intended for multilingual misinformation research. The dataset is in Danish and has the same format as the well-known English FEVER dataset. It can be used for testing methods in multilingual settings, as well as for creating models in production for the Danish language.}, + address = {Reykjavik, Iceland (Online)}, + author = {N{\o}rregaard, Jeppe and +Derczynski, Leon}, + booktitle = {Proceedings of the 23rd Nordic Conference on Computational Linguistics (NoDaLiDa)}, + editor = {Dobnik, Simon and +{\O}vrelid, Lilja}, + month = may # { 31--2 } # jun, + pages = {422--428}, + publisher = {Link{\"o}ping University Electronic Press, Sweden}, + title = {{D}an{FEVER}: claim verification dataset for {D}anish}, + url = {https://aclanthology.org/2021.nodalida-main.47}, + year = {2021}, } """, prompt={ diff --git a/mteb/tasks/Retrieval/dan/TV2Nordretrieval.py b/mteb/tasks/Retrieval/dan/TV2Nordretrieval.py index 1abc46fcc9..388997c7f6 100644 --- a/mteb/tasks/Retrieval/dan/TV2Nordretrieval.py +++ b/mteb/tasks/Retrieval/dan/TV2Nordretrieval.py @@ -27,34 +27,36 @@ class TV2Nordretrieval(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{flansmose-mikkelsen-etal-2022-ddisco, - 
title = "{DD}is{C}o: A Discourse Coherence Dataset for {D}anish", - author = "Flansmose Mikkelsen, Linea and - Kinch, Oliver and - Jess Pedersen, Anders and - Lacroix, Oph{\'e}lie", - editor = "Calzolari, Nicoletta and - B{\'e}chet, Fr{\'e}d{\'e}ric and - Blache, Philippe and - Choukri, Khalid and - Cieri, Christopher and - Declerck, Thierry and - Goggi, Sara and - Isahara, Hitoshi and - Maegaard, Bente and - Mariani, Joseph and - Mazo, H{\'e}l{\`e}ne and - Odijk, Jan and - Piperidis, Stelios", - booktitle = "Proceedings of the Thirteenth Language Resources and Evaluation Conference", - month = jun, - year = "2022", - address = "Marseille, France", - publisher = "European Language Resources Association", - url = "https://aclanthology.org/2022.lrec-1.260", - pages = "2440--2445", - abstract = "To date, there has been no resource for studying discourse coherence on real-world Danish texts. Discourse coherence has mostly been approached with the assumption that incoherent texts can be represented by coherent texts in which sentences have been shuffled. However, incoherent real-world texts rarely resemble that. We thus present DDisCo, a dataset including text from the Danish Wikipedia and Reddit annotated for discourse coherence. We choose to annotate real-world texts instead of relying on artificially incoherent text for training and testing models. Then, we evaluate the performance of several methods, including neural networks, on the dataset.", -}""", + bibtex_citation=r""" +@inproceedings{flansmose-mikkelsen-etal-2022-ddisco, + abstract = {To date, there has been no resource for studying discourse coherence on real-world Danish texts. Discourse coherence has mostly been approached with the assumption that incoherent texts can be represented by coherent texts in which sentences have been shuffled. However, incoherent real-world texts rarely resemble that. 
We thus present DDisCo, a dataset including text from the Danish Wikipedia and Reddit annotated for discourse coherence. We choose to annotate real-world texts instead of relying on artificially incoherent text for training and testing models. Then, we evaluate the performance of several methods, including neural networks, on the dataset.}, + address = {Marseille, France}, + author = {Flansmose Mikkelsen, Linea and +Kinch, Oliver and +Jess Pedersen, Anders and +Lacroix, Oph{\'e}lie}, + booktitle = {Proceedings of the Thirteenth Language Resources and Evaluation Conference}, + editor = {Calzolari, Nicoletta and +B{\'e}chet, Fr{\'e}d{\'e}ric and +Blache, Philippe and +Choukri, Khalid and +Cieri, Christopher and +Declerck, Thierry and +Goggi, Sara and +Isahara, Hitoshi and +Maegaard, Bente and +Mariani, Joseph and +Mazo, H{\'e}l{\`e}ne and +Odijk, Jan and +Piperidis, Stelios}, + month = jun, + pages = {2440--2445}, + publisher = {European Language Resources Association}, + title = {{DD}is{C}o: A Discourse Coherence Dataset for {D}anish}, + url = {https://aclanthology.org/2022.lrec-1.260}, + year = {2022}, +} +""", prompt={ "query": "Given a summary of a Danish news article retrieve the corresponding news article" }, diff --git a/mteb/tasks/Retrieval/dan/TwitterHjerneRetrieval.py b/mteb/tasks/Retrieval/dan/TwitterHjerneRetrieval.py index 5bc91789e7..198d1bc1b5 100644 --- a/mteb/tasks/Retrieval/dan/TwitterHjerneRetrieval.py +++ b/mteb/tasks/Retrieval/dan/TwitterHjerneRetrieval.py @@ -27,11 +27,11 @@ class TwitterHjerneRetrieval(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation=""" + bibtex_citation=r""" @article{holm2024gllms, - title={Are GLLMs Danoliterate? Benchmarking Generative NLP in Danish}, - author={Holm, Soren Vejlgaard}, - year={2024} + author = {Holm, Soren Vejlgaard}, + title = {Are GLLMs Danoliterate? 
Benchmarking Generative NLP in Danish}, + year = {2024}, } """, prompt={"query": "Retrieve answers to questions asked in Danish tweets"}, diff --git a/mteb/tasks/Retrieval/deu/GerDaLIRRetrieval.py b/mteb/tasks/Retrieval/deu/GerDaLIRRetrieval.py index 111eb986ed..380f30dcdf 100644 --- a/mteb/tasks/Retrieval/deu/GerDaLIRRetrieval.py +++ b/mteb/tasks/Retrieval/deu/GerDaLIRRetrieval.py @@ -25,25 +25,27 @@ class GerDaLIR(AbsTaskRetrieval): eval_langs=["deu-Latn"], main_score="ndcg_at_10", date=None, - domains=None, - task_subtypes=None, + domains=["Legal"], + task_subtypes=[], license=None, annotations_creators=None, dialect=None, sample_creation=None, - bibtex_citation="""@inproceedings{wrzalik-krechel-2021-gerdalir, - title = "{G}er{D}a{LIR}: A {G}erman Dataset for Legal Information Retrieval", - author = "Wrzalik, Marco and - Krechel, Dirk", - booktitle = "Proceedings of the Natural Legal Language Processing Workshop 2021", - month = nov, - year = "2021", - address = "Punta Cana, Dominican Republic", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/2021.nllp-1.13", - pages = "123--128", - abstract = "We present GerDaLIR, a German Dataset for Legal Information Retrieval based on case documents from the open legal information platform Open Legal Data. The dataset consists of 123K queries, each labelled with at least one relevant document in a collection of 131K case documents. We conduct several baseline experiments including BM25 and a state-of-the-art neural re-ranker. With our dataset, we aim to provide a standardized benchmark for German LIR and promote open research in this area. 
Beyond that, our dataset comprises sufficient training data to be used as a downstream task for German or multilingual language models.", -}""", + bibtex_citation=r""" +@inproceedings{wrzalik-krechel-2021-gerdalir, + abstract = {We present GerDaLIR, a German Dataset for Legal Information Retrieval based on case documents from the open legal information platform Open Legal Data. The dataset consists of 123K queries, each labelled with at least one relevant document in a collection of 131K case documents. We conduct several baseline experiments including BM25 and a state-of-the-art neural re-ranker. With our dataset, we aim to provide a standardized benchmark for German LIR and promote open research in this area. Beyond that, our dataset comprises sufficient training data to be used as a downstream task for German or multilingual language models.}, + address = {Punta Cana, Dominican Republic}, + author = {Wrzalik, Marco and +Krechel, Dirk}, + booktitle = {Proceedings of the Natural Legal Language Processing Workshop 2021}, + month = nov, + pages = {123--128}, + publisher = {Association for Computational Linguistics}, + title = {{G}er{D}a{LIR}: A {G}erman Dataset for Legal Information Retrieval}, + url = {https://aclanthology.org/2021.nllp-1.13}, + year = {2021}, +} +""", ) def load_data(self, **kwargs): diff --git a/mteb/tasks/Retrieval/deu/GerDaLIRSmallRetrieval.py b/mteb/tasks/Retrieval/deu/GerDaLIRSmallRetrieval.py index d80487251e..3325a75c5c 100644 --- a/mteb/tasks/Retrieval/deu/GerDaLIRSmallRetrieval.py +++ b/mteb/tasks/Retrieval/deu/GerDaLIRSmallRetrieval.py @@ -27,17 +27,19 @@ class GerDaLIRSmall(AbsTaskRetrieval): annotations_creators="derived", dialect=None, sample_creation="found", - bibtex_citation="""@inproceedings{wrzalik-krechel-2021-gerdalir, - title = "{G}er{D}a{LIR}: A {G}erman Dataset for Legal Information Retrieval", - author = "Wrzalik, Marco and - Krechel, Dirk", - booktitle = "Proceedings of the Natural Legal Language Processing Workshop 2021", 
- month = nov, - year = "2021", - address = "Punta Cana, Dominican Republic", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/2021.nllp-1.13", - pages = "123--128", - abstract = "We present GerDaLIR, a German Dataset for Legal Information Retrieval based on case documents from the open legal information platform Open Legal Data. The dataset consists of 123K queries, each labelled with at least one relevant document in a collection of 131K case documents. We conduct several baseline experiments including BM25 and a state-of-the-art neural re-ranker. With our dataset, we aim to provide a standardized benchmark for German LIR and promote open research in this area. Beyond that, our dataset comprises sufficient training data to be used as a downstream task for German or multilingual language models.", -}""", + bibtex_citation=r""" +@inproceedings{wrzalik-krechel-2021-gerdalir, + abstract = {We present GerDaLIR, a German Dataset for Legal Information Retrieval based on case documents from the open legal information platform Open Legal Data. The dataset consists of 123K queries, each labelled with at least one relevant document in a collection of 131K case documents. We conduct several baseline experiments including BM25 and a state-of-the-art neural re-ranker. With our dataset, we aim to provide a standardized benchmark for German LIR and promote open research in this area. 
Beyond that, our dataset comprises sufficient training data to be used as a downstream task for German or multilingual language models.}, + address = {Punta Cana, Dominican Republic}, + author = {Wrzalik, Marco and +Krechel, Dirk}, + booktitle = {Proceedings of the Natural Legal Language Processing Workshop 2021}, + month = nov, + pages = {123--128}, + publisher = {Association for Computational Linguistics}, + title = {{G}er{D}a{LIR}: A {G}erman Dataset for Legal Information Retrieval}, + url = {https://aclanthology.org/2021.nllp-1.13}, + year = {2021}, +} +""", ) diff --git a/mteb/tasks/Retrieval/deu/GermanDPRRetrieval.py b/mteb/tasks/Retrieval/deu/GermanDPRRetrieval.py index a3118b8f73..9b280aaecf 100644 --- a/mteb/tasks/Retrieval/deu/GermanDPRRetrieval.py +++ b/mteb/tasks/Retrieval/deu/GermanDPRRetrieval.py @@ -25,21 +25,23 @@ class GermanDPR(AbsTaskRetrieval): eval_splits=[_EVAL_SPLIT], eval_langs=["deu-Latn"], main_score="ndcg_at_10", - date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, - bibtex_citation="""@misc{möller2021germanquad, - title={GermanQuAD and GermanDPR: Improving Non-English Question Answering and Passage Retrieval}, - author={Timo Möller and Julian Risch and Malte Pietsch}, - year={2021}, - eprint={2104.12741}, - archivePrefix={arXiv}, - primaryClass={cs.CL} -}""", + date=("2020-05-19", "2021-04-26"), + domains=["Written", "Non-fiction", "Web"], + task_subtypes=["Question answering"], + license="cc-by-4.0", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@misc{möller2021germanquad, + archiveprefix = {arXiv}, + author = {Timo Möller and Julian Risch and Malte Pietsch}, + eprint = {2104.12741}, + primaryclass = {cs.CL}, + title = {GermanQuAD and GermanDPR: Improving Non-English Question Answering and Passage Retrieval}, + year = {2021}, +} +""", ) @staticmethod diff --git 
a/mteb/tasks/Retrieval/deu/GermanGovServiceRetrieval.py b/mteb/tasks/Retrieval/deu/GermanGovServiceRetrieval.py index 7d77873882..a6fc74455a 100644 --- a/mteb/tasks/Retrieval/deu/GermanGovServiceRetrieval.py +++ b/mteb/tasks/Retrieval/deu/GermanGovServiceRetrieval.py @@ -32,18 +32,20 @@ class GermanGovServiceRetrieval(AbsTaskRetrieval): license="mit", annotations_creators="derived", dialect=[], - bibtex_citation="""@software{lhm-dienstleistungen-qa, - author = {Schröder, Leon Marius and - Gutknecht, Clemens and - Alkiddeh, Oubada and - Susanne Weiß, - Lukas, Leon}, - title = {LHM-Dienstleistungen-QA - german public domain question-answering dataset}, - month = nov, - year = 2022, - publisher = {it@M}, - url = {https://huggingface.co/datasets/it-at-m/LHM-Dienstleistungen-QA} -}""", + bibtex_citation=r""" +@software{lhm-dienstleistungen-qa, + author = {Schröder, Leon Marius and +Gutknecht, Clemens and +Alkiddeh, Oubada and +Susanne Weiß, +Lukas, Leon}, + month = nov, + publisher = {it@M}, + title = {LHM-Dienstleistungen-QA - german public domain question-answering dataset}, + url = {https://huggingface.co/datasets/it-at-m/LHM-Dienstleistungen-QA}, + year = {2022}, +} +""", sample_creation="found", ) diff --git a/mteb/tasks/Retrieval/deu/GermanQuADRetrieval.py b/mteb/tasks/Retrieval/deu/GermanQuADRetrieval.py index ba6a21e96e..a6b1a5d7d5 100644 --- a/mteb/tasks/Retrieval/deu/GermanQuADRetrieval.py +++ b/mteb/tasks/Retrieval/deu/GermanQuADRetrieval.py @@ -31,7 +31,7 @@ class GermanQuADRetrieval(AbsTaskRetrieval): metadata = TaskMetadata( name="GermanQuAD-Retrieval", description="Context Retrieval for German Question Answering", - reference="https://www.kaggle.com/datasets/GermanQuAD", + reference="https://huggingface.co/datasets/deepset/germanquad", dataset={ "path": "mteb/germanquad-retrieval", "revision": "f5c87ae5a2e7a5106606314eef45255f03151bb3", @@ -42,21 +42,23 @@ class GermanQuADRetrieval(AbsTaskRetrieval): eval_splits=["test"], eval_langs=["deu-Latn"], 
main_score="mrr_at_5", - date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, - bibtex_citation="""misc{möller2021germanquad, - title={GermanQuAD and GermanDPR: Improving Non-English Question Answering and Passage Retrieval}, - author={Timo Möller and Julian Risch and Malte Pietsch}, - year={2021}, - eprint={2104.12741}, - archivePrefix={arXiv}, - primaryClass={cs.CL} -}""", + date=("2020-05-19", "2021-04-26"), + domains=["Written", "Non-fiction", "Web"], + task_subtypes=["Question answering"], + license="cc-by-4.0", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@misc{möller2021germanquad, + archiveprefix = {arXiv}, + author = {Timo Möller and Julian Risch and Malte Pietsch}, + eprint = {2104.12741}, + primaryclass = {cs.CL}, + title = {GermanQuAD and GermanDPR: Improving Non-English Question Answering and Passage Retrieval}, + year = {2021}, +} +""", ) def load_data(self, **kwargs): diff --git a/mteb/tasks/Retrieval/deu/LegalQuADRetrieval.py b/mteb/tasks/Retrieval/deu/LegalQuADRetrieval.py index 663d427f97..60132b1aad 100644 --- a/mteb/tasks/Retrieval/deu/LegalQuADRetrieval.py +++ b/mteb/tasks/Retrieval/deu/LegalQuADRetrieval.py @@ -27,15 +27,17 @@ class LegalQuAD(AbsTaskRetrieval): annotations_creators="derived", dialect=None, sample_creation="found", - bibtex_citation="""@INPROCEEDINGS{9723721, - author={Hoppe, Christoph and Pelkmann, David and Migenda, Nico and Hötte, Daniel and Schenck, Wolfram}, - booktitle={2021 IEEE Fourth International Conference on Artificial Intelligence and Knowledge Engineering (AIKE)}, - title={Towards Intelligent Legal Advisors for Document Retrieval and Question-Answering in German Legal Documents}, - year={2021}, - volume={}, - number={}, - pages={29-32}, - keywords={Knowledge engineering;Law;Semantic search;Conferences;Bit error rate;NLP;knowledge extraction;question-answering;semantic 
search;document retrieval;German language}, - doi={10.1109/AIKE52691.2021.00011} - }""", + bibtex_citation=r""" +@inproceedings{9723721, + author = {Hoppe, Christoph and Pelkmann, David and Migenda, Nico and Hötte, Daniel and Schenck, Wolfram}, + booktitle = {2021 IEEE Fourth International Conference on Artificial Intelligence and Knowledge Engineering (AIKE)}, + doi = {10.1109/AIKE52691.2021.00011}, + keywords = {Knowledge engineering;Law;Semantic search;Conferences;Bit error rate;NLP;knowledge extraction;question-answering;semantic search;document retrieval;German language}, + number = {}, + pages = {29-32}, + title = {Towards Intelligent Legal Advisors for Document Retrieval and Question-Answering in German Legal Documents}, + volume = {}, + year = {2021}, +} +""", ) diff --git a/mteb/tasks/Retrieval/eng/AILACasedocsRetrieval.py b/mteb/tasks/Retrieval/eng/AILACasedocsRetrieval.py index d1d56e7737..2549e584ba 100644 --- a/mteb/tasks/Retrieval/eng/AILACasedocsRetrieval.py +++ b/mteb/tasks/Retrieval/eng/AILACasedocsRetrieval.py @@ -27,19 +27,21 @@ class AILACasedocs(AbsTaskRetrieval): annotations_creators="derived", dialect=None, sample_creation="found", - bibtex_citation="""@dataset{paheli_bhattacharya_2020_4063986, - author = {Paheli Bhattacharya and - Kripabandhu Ghosh and - Saptarshi Ghosh and - Arindam Pal and - Parth Mehta and - Arnab Bhattacharya and - Prasenjit Majumder}, - title = {AILA 2019 Precedent \& Statute Retrieval Task}, - month = oct, - year = 2020, - publisher = {Zenodo}, - doi = {10.5281/zenodo.4063986}, - url = {https://doi.org/10.5281/zenodo.4063986} -}""", + bibtex_citation=r""" +@dataset{paheli_bhattacharya_2020_4063986, + author = {Paheli Bhattacharya and +Kripabandhu Ghosh and +Saptarshi Ghosh and +Arindam Pal and +Parth Mehta and +Arnab Bhattacharya and +Prasenjit Majumder}, + doi = {10.5281/zenodo.4063986}, + month = oct, + publisher = {Zenodo}, + title = {AILA 2019 Precedent \& Statute Retrieval Task}, + url = 
{https://doi.org/10.5281/zenodo.4063986}, + year = {2020}, +} +""", ) diff --git a/mteb/tasks/Retrieval/eng/AILAStatutesRetrieval.py b/mteb/tasks/Retrieval/eng/AILAStatutesRetrieval.py index 4577c64ed6..43a7b15602 100644 --- a/mteb/tasks/Retrieval/eng/AILAStatutesRetrieval.py +++ b/mteb/tasks/Retrieval/eng/AILAStatutesRetrieval.py @@ -27,19 +27,21 @@ class AILAStatutes(AbsTaskRetrieval): annotations_creators="derived", dialect=None, sample_creation="found", - bibtex_citation="""@dataset{paheli_bhattacharya_2020_4063986, - author = {Paheli Bhattacharya and - Kripabandhu Ghosh and - Saptarshi Ghosh and - Arindam Pal and - Parth Mehta and - Arnab Bhattacharya and - Prasenjit Majumder}, - title = {AILA 2019 Precedent \& Statute Retrieval Task}, - month = oct, - year = 2020, - publisher = {Zenodo}, - doi = {10.5281/zenodo.4063986}, - url = {https://doi.org/10.5281/zenodo.4063986} -}""", + bibtex_citation=r""" +@dataset{paheli_bhattacharya_2020_4063986, + author = {Paheli Bhattacharya and +Kripabandhu Ghosh and +Saptarshi Ghosh and +Arindam Pal and +Parth Mehta and +Arnab Bhattacharya and +Prasenjit Majumder}, + doi = {10.5281/zenodo.4063986}, + month = oct, + publisher = {Zenodo}, + title = {AILA 2019 Precedent \& Statute Retrieval Task}, + url = {https://doi.org/10.5281/zenodo.4063986}, + year = {2020}, +} +""", ) diff --git a/mteb/tasks/Retrieval/eng/ARCChallengeRetrieval.py b/mteb/tasks/Retrieval/eng/ARCChallengeRetrieval.py index 7488e902d2..cb96c9cb78 100644 --- a/mteb/tasks/Retrieval/eng/ARCChallengeRetrieval.py +++ b/mteb/tasks/Retrieval/eng/ARCChallengeRetrieval.py @@ -28,17 +28,19 @@ class ARCChallenge(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@article{xiao2024rar, - title={RAR-b: Reasoning as Retrieval Benchmark}, - author={Xiao, Chenghao and Hudson, G Thomas and Moubayed, Noura Al}, - journal={arXiv preprint arXiv:2404.06347}, - year={2024} -} + bibtex_citation=r""" @article{clark2018think, - 
title={Think you have solved question answering? try arc, the ai2 reasoning challenge}, - author={Clark, Peter and Cowhey, Isaac and Etzioni, Oren and Khot, Tushar and Sabharwal, Ashish and Schoenick, Carissa and Tafjord, Oyvind}, - journal={arXiv preprint arXiv:1803.05457}, - year={2018} + author = {Clark, Peter and Cowhey, Isaac and Etzioni, Oren and Khot, Tushar and Sabharwal, Ashish and Schoenick, Carissa and Tafjord, Oyvind}, + journal = {arXiv preprint arXiv:1803.05457}, + title = {Think you have solved question answering? try arc, the ai2 reasoning challenge}, + year = {2018}, +} + +@article{xiao2024rar, + author = {Xiao, Chenghao and Hudson, G Thomas and Moubayed, Noura Al}, + journal = {arXiv preprint arXiv:2404.06347}, + title = {RAR-b: Reasoning as Retrieval Benchmark}, + year = {2024}, } """, prompt={"query": "Retrieve the answer to the question."}, diff --git a/mteb/tasks/Retrieval/eng/AlphaNLIRetrieval.py b/mteb/tasks/Retrieval/eng/AlphaNLIRetrieval.py index 3fd53b5ab5..4ac99f5fcc 100644 --- a/mteb/tasks/Retrieval/eng/AlphaNLIRetrieval.py +++ b/mteb/tasks/Retrieval/eng/AlphaNLIRetrieval.py @@ -28,18 +28,19 @@ class AlphaNLI(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@article{xiao2024rar, - title={RAR-b: Reasoning as Retrieval Benchmark}, - author={Xiao, Chenghao and Hudson, G Thomas and Moubayed, Noura Al}, - journal={arXiv preprint arXiv:2404.06347}, - year={2024} + bibtex_citation=r""" +@article{bhagavatula2019abductive, + author = {Bhagavatula, Chandra and Bras, Ronan Le and Malaviya, Chaitanya and Sakaguchi, Keisuke and Holtzman, Ari and Rashkin, Hannah and Downey, Doug and Yih, Scott Wen-tau and Choi, Yejin}, + journal = {arXiv preprint arXiv:1908.05739}, + title = {Abductive commonsense reasoning}, + year = {2019}, } -@article{bhagavatula2019abductive, - title={Abductive commonsense reasoning}, - author={Bhagavatula, Chandra and Bras, Ronan Le and Malaviya, Chaitanya and 
Sakaguchi, Keisuke and Holtzman, Ari and Rashkin, Hannah and Downey, Doug and Yih, Scott Wen-tau and Choi, Yejin}, - journal={arXiv preprint arXiv:1908.05739}, - year={2019} +@article{xiao2024rar, + author = {Xiao, Chenghao and Hudson, G Thomas and Moubayed, Noura Al}, + journal = {arXiv preprint arXiv:2404.06347}, + title = {RAR-b: Reasoning as Retrieval Benchmark}, + year = {2024}, } """, prompt={ diff --git a/mteb/tasks/Retrieval/eng/ArguAnaRetrieval.py b/mteb/tasks/Retrieval/eng/ArguAnaRetrieval.py index ff608bab6e..a3e7e93a75 100644 --- a/mteb/tasks/Retrieval/eng/ArguAnaRetrieval.py +++ b/mteb/tasks/Retrieval/eng/ArguAnaRetrieval.py @@ -29,15 +29,17 @@ class ArguAna(AbsTaskRetrieval): annotations_creators=None, dialect=[], sample_creation=None, - bibtex_citation="""@inproceedings{boteva2016, + bibtex_citation=r""" +@inproceedings{boteva2016, author = {Boteva, Vera and Gholipour, Demian and Sokolov, Artem and Riezler, Stefan}, - title = {A Full-Text Learning to Rank Dataset for Medical Information Retrieval}, + city = {Padova}, + country = {Italy}, journal = {Proceedings of the 38th European Conference on Information Retrieval}, journal-abbrev = {ECIR}, + title = {A Full-Text Learning to Rank Dataset for Medical Information Retrieval}, + url = {http://www.cl.uni-heidelberg.de/~riezler/publications/papers/ECIR2016.pdf}, year = {2016}, - city = {Padova}, - country = {Italy}, - url = {http://www.cl.uni-heidelberg.de/~riezler/publications/papers/ECIR2016.pdf} -}""", +} +""", prompt={"query": "Given a claim, find documents that refute the claim"}, ) diff --git a/mteb/tasks/Retrieval/eng/BrightRetrieval.py b/mteb/tasks/Retrieval/eng/BrightRetrieval.py index 4cb6cf8fbd..35b5b2e0bb 100644 --- a/mteb/tasks/Retrieval/eng/BrightRetrieval.py +++ b/mteb/tasks/Retrieval/eng/BrightRetrieval.py @@ -124,17 +124,17 @@ class BrightRetrieval(MultilingualTask, AbsTaskRetrieval): dialect=[], sample_creation="found", modalities=["text"], - bibtex_citation=""" + bibtex_citation=r""" 
@misc{su2024brightrealisticchallengingbenchmark, - title={BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval}, - author={Hongjin Su and Howard Yen and Mengzhou Xia and Weijia Shi and Niklas Muennighoff and Han-yu Wang and Haisu Liu and Quan Shi and Zachary S. Siegel and Michael Tang and Ruoxi Sun and Jinsung Yoon and Sercan O. Arik and Danqi Chen and Tao Yu}, - year={2024}, - eprint={2407.12883}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2407.12883}, + archiveprefix = {arXiv}, + author = {Hongjin Su and Howard Yen and Mengzhou Xia and Weijia Shi and Niklas Muennighoff and Han-yu Wang and Haisu Liu and Quan Shi and Zachary S. Siegel and Michael Tang and Ruoxi Sun and Jinsung Yoon and Sercan O. Arik and Danqi Chen and Tao Yu}, + eprint = {2407.12883}, + primaryclass = {cs.CL}, + title = {BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval}, + url = {https://arxiv.org/abs/2407.12883}, + year = {2024}, } - """, +""", ) load_bright_data = load_bright_data load_data = load_data diff --git a/mteb/tasks/Retrieval/eng/BuiltBenchRetrieval.py b/mteb/tasks/Retrieval/eng/BuiltBenchRetrieval.py index 5d36f219a7..5762b75c23 100644 --- a/mteb/tasks/Retrieval/eng/BuiltBenchRetrieval.py +++ b/mteb/tasks/Retrieval/eng/BuiltBenchRetrieval.py @@ -27,12 +27,14 @@ class BuiltBenchRetrieval(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="created", - bibtex_citation="""@article{shahinmoghadam2024benchmarking, - title={Benchmarking pre-trained text embedding models in aligning built asset information}, - author={Shahinmoghadam, Mehrzad and Motamedi, Ali}, - journal={arXiv preprint arXiv:2411.12056}, - year={2024} -}""", + bibtex_citation=r""" +@article{shahinmoghadam2024benchmarking, + author = {Shahinmoghadam, Mehrzad and Motamedi, Ali}, + journal = {arXiv preprint arXiv:2411.12056}, + title = {Benchmarking pre-trained text embedding models in aligning built asset 
information}, + year = {2024}, +} +""", prompt={ "query": "Given a query, retrieve relevant entity descriptions from buit asset classification systems such as IFC and Uniclass" }, diff --git a/mteb/tasks/Retrieval/eng/CQADupstackAndroidRetrieval.py b/mteb/tasks/Retrieval/eng/CQADupstackAndroidRetrieval.py index 156395a077..018c0e33c4 100644 --- a/mteb/tasks/Retrieval/eng/CQADupstackAndroidRetrieval.py +++ b/mteb/tasks/Retrieval/eng/CQADupstackAndroidRetrieval.py @@ -27,21 +27,23 @@ class CQADupstackAndroidRetrieval(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{hoogeveen2015, -author = {Hoogeveen, Doris and Verspoor, Karin M. and Baldwin, Timothy}, -title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research}, -booktitle = {Proceedings of the 20th Australasian Document Computing Symposium (ADCS)}, -series = {ADCS '15}, -year = {2015}, -isbn = {978-1-4503-4040-3}, -location = {Parramatta, NSW, Australia}, -pages = {3:1--3:8}, -articleno = {3}, -numpages = {8}, -url = {http://doi.acm.org/10.1145/2838931.2838934}, -doi = {10.1145/2838931.2838934}, -acmid = {2838934}, -publisher = {ACM}, -address = {New York, NY, USA}, -}""", + bibtex_citation=r""" +@inproceedings{hoogeveen2015, + acmid = {2838934}, + address = {New York, NY, USA}, + articleno = {3}, + author = {Hoogeveen, Doris and Verspoor, Karin M. 
and Baldwin, Timothy}, + booktitle = {Proceedings of the 20th Australasian Document Computing Symposium (ADCS)}, + doi = {10.1145/2838931.2838934}, + isbn = {978-1-4503-4040-3}, + location = {Parramatta, NSW, Australia}, + numpages = {8}, + pages = {3:1--3:8}, + publisher = {ACM}, + series = {ADCS '15}, + title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research}, + url = {http://doi.acm.org/10.1145/2838931.2838934}, + year = {2015}, +} +""", ) diff --git a/mteb/tasks/Retrieval/eng/CQADupstackEnglishRetrieval.py b/mteb/tasks/Retrieval/eng/CQADupstackEnglishRetrieval.py index af47eda5c4..0c7397de97 100644 --- a/mteb/tasks/Retrieval/eng/CQADupstackEnglishRetrieval.py +++ b/mteb/tasks/Retrieval/eng/CQADupstackEnglishRetrieval.py @@ -27,21 +27,23 @@ class CQADupstackEnglishRetrieval(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{hoogeveen2015, -author = {Hoogeveen, Doris and Verspoor, Karin M. and Baldwin, Timothy}, -title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research}, -booktitle = {Proceedings of the 20th Australasian Document Computing Symposium (ADCS)}, -series = {ADCS '15}, -year = {2015}, -isbn = {978-1-4503-4040-3}, -location = {Parramatta, NSW, Australia}, -pages = {3:1--3:8}, -articleno = {3}, -numpages = {8}, -url = {http://doi.acm.org/10.1145/2838931.2838934}, -doi = {10.1145/2838931.2838934}, -acmid = {2838934}, -publisher = {ACM}, -address = {New York, NY, USA}, -}""", + bibtex_citation=r""" +@inproceedings{hoogeveen2015, + acmid = {2838934}, + address = {New York, NY, USA}, + articleno = {3}, + author = {Hoogeveen, Doris and Verspoor, Karin M. 
and Baldwin, Timothy}, + booktitle = {Proceedings of the 20th Australasian Document Computing Symposium (ADCS)}, + doi = {10.1145/2838931.2838934}, + isbn = {978-1-4503-4040-3}, + location = {Parramatta, NSW, Australia}, + numpages = {8}, + pages = {3:1--3:8}, + publisher = {ACM}, + series = {ADCS '15}, + title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research}, + url = {http://doi.acm.org/10.1145/2838931.2838934}, + year = {2015}, +} +""", ) diff --git a/mteb/tasks/Retrieval/eng/CQADupstackGamingRetrieval.py b/mteb/tasks/Retrieval/eng/CQADupstackGamingRetrieval.py index b51a3e64b5..e5f557b0ec 100644 --- a/mteb/tasks/Retrieval/eng/CQADupstackGamingRetrieval.py +++ b/mteb/tasks/Retrieval/eng/CQADupstackGamingRetrieval.py @@ -27,21 +27,23 @@ class CQADupstackGamingRetrieval(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{hoogeveen2015, -author = {Hoogeveen, Doris and Verspoor, Karin M. and Baldwin, Timothy}, -title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research}, -booktitle = {Proceedings of the 20th Australasian Document Computing Symposium (ADCS)}, -series = {ADCS '15}, -year = {2015}, -isbn = {978-1-4503-4040-3}, -location = {Parramatta, NSW, Australia}, -pages = {3:1--3:8}, -articleno = {3}, -numpages = {8}, -url = {http://doi.acm.org/10.1145/2838931.2838934}, -doi = {10.1145/2838931.2838934}, -acmid = {2838934}, -publisher = {ACM}, -address = {New York, NY, USA}, -}""", + bibtex_citation=r""" +@inproceedings{hoogeveen2015, + acmid = {2838934}, + address = {New York, NY, USA}, + articleno = {3}, + author = {Hoogeveen, Doris and Verspoor, Karin M. 
and Baldwin, Timothy}, + booktitle = {Proceedings of the 20th Australasian Document Computing Symposium (ADCS)}, + doi = {10.1145/2838931.2838934}, + isbn = {978-1-4503-4040-3}, + location = {Parramatta, NSW, Australia}, + numpages = {8}, + pages = {3:1--3:8}, + publisher = {ACM}, + series = {ADCS '15}, + title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research}, + url = {http://doi.acm.org/10.1145/2838931.2838934}, + year = {2015}, +} +""", ) diff --git a/mteb/tasks/Retrieval/eng/CQADupstackGisRetrieval.py b/mteb/tasks/Retrieval/eng/CQADupstackGisRetrieval.py index da38284f2d..ed6be358ee 100644 --- a/mteb/tasks/Retrieval/eng/CQADupstackGisRetrieval.py +++ b/mteb/tasks/Retrieval/eng/CQADupstackGisRetrieval.py @@ -27,21 +27,23 @@ class CQADupstackGisRetrieval(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{hoogeveen2015, -author = {Hoogeveen, Doris and Verspoor, Karin M. and Baldwin, Timothy}, -title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research}, -booktitle = {Proceedings of the 20th Australasian Document Computing Symposium (ADCS)}, -series = {ADCS '15}, -year = {2015}, -isbn = {978-1-4503-4040-3}, -location = {Parramatta, NSW, Australia}, -pages = {3:1--3:8}, -articleno = {3}, -numpages = {8}, -url = {http://doi.acm.org/10.1145/2838931.2838934}, -doi = {10.1145/2838931.2838934}, -acmid = {2838934}, -publisher = {ACM}, -address = {New York, NY, USA}, -}""", + bibtex_citation=r""" +@inproceedings{hoogeveen2015, + acmid = {2838934}, + address = {New York, NY, USA}, + articleno = {3}, + author = {Hoogeveen, Doris and Verspoor, Karin M. 
and Baldwin, Timothy}, + booktitle = {Proceedings of the 20th Australasian Document Computing Symposium (ADCS)}, + doi = {10.1145/2838931.2838934}, + isbn = {978-1-4503-4040-3}, + location = {Parramatta, NSW, Australia}, + numpages = {8}, + pages = {3:1--3:8}, + publisher = {ACM}, + series = {ADCS '15}, + title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research}, + url = {http://doi.acm.org/10.1145/2838931.2838934}, + year = {2015}, +} +""", ) diff --git a/mteb/tasks/Retrieval/eng/CQADupstackMathematicaRetrieval.py b/mteb/tasks/Retrieval/eng/CQADupstackMathematicaRetrieval.py index b29d166129..a141fe222e 100644 --- a/mteb/tasks/Retrieval/eng/CQADupstackMathematicaRetrieval.py +++ b/mteb/tasks/Retrieval/eng/CQADupstackMathematicaRetrieval.py @@ -27,21 +27,23 @@ class CQADupstackMathematicaRetrieval(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{hoogeveen2015, -author = {Hoogeveen, Doris and Verspoor, Karin M. and Baldwin, Timothy}, -title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research}, -booktitle = {Proceedings of the 20th Australasian Document Computing Symposium (ADCS)}, -series = {ADCS '15}, -year = {2015}, -isbn = {978-1-4503-4040-3}, -location = {Parramatta, NSW, Australia}, -pages = {3:1--3:8}, -articleno = {3}, -numpages = {8}, -url = {http://doi.acm.org/10.1145/2838931.2838934}, -doi = {10.1145/2838931.2838934}, -acmid = {2838934}, -publisher = {ACM}, -address = {New York, NY, USA}, -}""", + bibtex_citation=r""" +@inproceedings{hoogeveen2015, + acmid = {2838934}, + address = {New York, NY, USA}, + articleno = {3}, + author = {Hoogeveen, Doris and Verspoor, Karin M. 
and Baldwin, Timothy}, + booktitle = {Proceedings of the 20th Australasian Document Computing Symposium (ADCS)}, + doi = {10.1145/2838931.2838934}, + isbn = {978-1-4503-4040-3}, + location = {Parramatta, NSW, Australia}, + numpages = {8}, + pages = {3:1--3:8}, + publisher = {ACM}, + series = {ADCS '15}, + title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research}, + url = {http://doi.acm.org/10.1145/2838931.2838934}, + year = {2015}, +} +""", ) diff --git a/mteb/tasks/Retrieval/eng/CQADupstackPhysicsRetrieval.py b/mteb/tasks/Retrieval/eng/CQADupstackPhysicsRetrieval.py index 3dd0fdc4a5..df6bb8e913 100644 --- a/mteb/tasks/Retrieval/eng/CQADupstackPhysicsRetrieval.py +++ b/mteb/tasks/Retrieval/eng/CQADupstackPhysicsRetrieval.py @@ -27,21 +27,23 @@ class CQADupstackPhysicsRetrieval(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{hoogeveen2015, -author = {Hoogeveen, Doris and Verspoor, Karin M. and Baldwin, Timothy}, -title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research}, -booktitle = {Proceedings of the 20th Australasian Document Computing Symposium (ADCS)}, -series = {ADCS '15}, -year = {2015}, -isbn = {978-1-4503-4040-3}, -location = {Parramatta, NSW, Australia}, -pages = {3:1--3:8}, -articleno = {3}, -numpages = {8}, -url = {http://doi.acm.org/10.1145/2838931.2838934}, -doi = {10.1145/2838931.2838934}, -acmid = {2838934}, -publisher = {ACM}, -address = {New York, NY, USA}, -}""", + bibtex_citation=r""" +@inproceedings{hoogeveen2015, + acmid = {2838934}, + address = {New York, NY, USA}, + articleno = {3}, + author = {Hoogeveen, Doris and Verspoor, Karin M. 
and Baldwin, Timothy}, + booktitle = {Proceedings of the 20th Australasian Document Computing Symposium (ADCS)}, + doi = {10.1145/2838931.2838934}, + isbn = {978-1-4503-4040-3}, + location = {Parramatta, NSW, Australia}, + numpages = {8}, + pages = {3:1--3:8}, + publisher = {ACM}, + series = {ADCS '15}, + title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research}, + url = {http://doi.acm.org/10.1145/2838931.2838934}, + year = {2015}, +} +""", ) diff --git a/mteb/tasks/Retrieval/eng/CQADupstackProgrammersRetrieval.py b/mteb/tasks/Retrieval/eng/CQADupstackProgrammersRetrieval.py index f84b1b17e4..5d1ef668e6 100644 --- a/mteb/tasks/Retrieval/eng/CQADupstackProgrammersRetrieval.py +++ b/mteb/tasks/Retrieval/eng/CQADupstackProgrammersRetrieval.py @@ -27,21 +27,23 @@ class CQADupstackProgrammersRetrieval(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{hoogeveen2015, -author = {Hoogeveen, Doris and Verspoor, Karin M. and Baldwin, Timothy}, -title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research}, -booktitle = {Proceedings of the 20th Australasian Document Computing Symposium (ADCS)}, -series = {ADCS '15}, -year = {2015}, -isbn = {978-1-4503-4040-3}, -location = {Parramatta, NSW, Australia}, -pages = {3:1--3:8}, -articleno = {3}, -numpages = {8}, -url = {http://doi.acm.org/10.1145/2838931.2838934}, -doi = {10.1145/2838931.2838934}, -acmid = {2838934}, -publisher = {ACM}, -address = {New York, NY, USA}, -}""", + bibtex_citation=r""" +@inproceedings{hoogeveen2015, + acmid = {2838934}, + address = {New York, NY, USA}, + articleno = {3}, + author = {Hoogeveen, Doris and Verspoor, Karin M. 
and Baldwin, Timothy}, + booktitle = {Proceedings of the 20th Australasian Document Computing Symposium (ADCS)}, + doi = {10.1145/2838931.2838934}, + isbn = {978-1-4503-4040-3}, + location = {Parramatta, NSW, Australia}, + numpages = {8}, + pages = {3:1--3:8}, + publisher = {ACM}, + series = {ADCS '15}, + title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research}, + url = {http://doi.acm.org/10.1145/2838931.2838934}, + year = {2015}, +} +""", ) diff --git a/mteb/tasks/Retrieval/eng/CQADupstackStatsRetrieval.py b/mteb/tasks/Retrieval/eng/CQADupstackStatsRetrieval.py index 1fd18f8d84..b0b052a3d9 100644 --- a/mteb/tasks/Retrieval/eng/CQADupstackStatsRetrieval.py +++ b/mteb/tasks/Retrieval/eng/CQADupstackStatsRetrieval.py @@ -27,21 +27,23 @@ class CQADupstackStatsRetrieval(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{hoogeveen2015, -author = {Hoogeveen, Doris and Verspoor, Karin M. and Baldwin, Timothy}, -title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research}, -booktitle = {Proceedings of the 20th Australasian Document Computing Symposium (ADCS)}, -series = {ADCS '15}, -year = {2015}, -isbn = {978-1-4503-4040-3}, -location = {Parramatta, NSW, Australia}, -pages = {3:1--3:8}, -articleno = {3}, -numpages = {8}, -url = {http://doi.acm.org/10.1145/2838931.2838934}, -doi = {10.1145/2838931.2838934}, -acmid = {2838934}, -publisher = {ACM}, -address = {New York, NY, USA}, -}""", + bibtex_citation=r""" +@inproceedings{hoogeveen2015, + acmid = {2838934}, + address = {New York, NY, USA}, + articleno = {3}, + author = {Hoogeveen, Doris and Verspoor, Karin M. 
and Baldwin, Timothy}, + booktitle = {Proceedings of the 20th Australasian Document Computing Symposium (ADCS)}, + doi = {10.1145/2838931.2838934}, + isbn = {978-1-4503-4040-3}, + location = {Parramatta, NSW, Australia}, + numpages = {8}, + pages = {3:1--3:8}, + publisher = {ACM}, + series = {ADCS '15}, + title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research}, + url = {http://doi.acm.org/10.1145/2838931.2838934}, + year = {2015}, +} +""", ) diff --git a/mteb/tasks/Retrieval/eng/CQADupstackTexRetrieval.py b/mteb/tasks/Retrieval/eng/CQADupstackTexRetrieval.py index c4447442be..f44a67ed9e 100644 --- a/mteb/tasks/Retrieval/eng/CQADupstackTexRetrieval.py +++ b/mteb/tasks/Retrieval/eng/CQADupstackTexRetrieval.py @@ -27,21 +27,23 @@ class CQADupstackTexRetrieval(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{hoogeveen2015, -author = {Hoogeveen, Doris and Verspoor, Karin M. and Baldwin, Timothy}, -title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research}, -booktitle = {Proceedings of the 20th Australasian Document Computing Symposium (ADCS)}, -series = {ADCS '15}, -year = {2015}, -isbn = {978-1-4503-4040-3}, -location = {Parramatta, NSW, Australia}, -pages = {3:1--3:8}, -articleno = {3}, -numpages = {8}, -url = {http://doi.acm.org/10.1145/2838931.2838934}, -doi = {10.1145/2838931.2838934}, -acmid = {2838934}, -publisher = {ACM}, -address = {New York, NY, USA}, -}""", + bibtex_citation=r""" +@inproceedings{hoogeveen2015, + acmid = {2838934}, + address = {New York, NY, USA}, + articleno = {3}, + author = {Hoogeveen, Doris and Verspoor, Karin M. 
and Baldwin, Timothy}, + booktitle = {Proceedings of the 20th Australasian Document Computing Symposium (ADCS)}, + doi = {10.1145/2838931.2838934}, + isbn = {978-1-4503-4040-3}, + location = {Parramatta, NSW, Australia}, + numpages = {8}, + pages = {3:1--3:8}, + publisher = {ACM}, + series = {ADCS '15}, + title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research}, + url = {http://doi.acm.org/10.1145/2838931.2838934}, + year = {2015}, +} +""", ) diff --git a/mteb/tasks/Retrieval/eng/CQADupstackUnixRetrieval.py b/mteb/tasks/Retrieval/eng/CQADupstackUnixRetrieval.py index 57c9964b15..7365f991f3 100644 --- a/mteb/tasks/Retrieval/eng/CQADupstackUnixRetrieval.py +++ b/mteb/tasks/Retrieval/eng/CQADupstackUnixRetrieval.py @@ -27,21 +27,23 @@ class CQADupstackUnixRetrieval(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{hoogeveen2015, -author = {Hoogeveen, Doris and Verspoor, Karin M. and Baldwin, Timothy}, -title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research}, -booktitle = {Proceedings of the 20th Australasian Document Computing Symposium (ADCS)}, -series = {ADCS '15}, -year = {2015}, -isbn = {978-1-4503-4040-3}, -location = {Parramatta, NSW, Australia}, -pages = {3:1--3:8}, -articleno = {3}, -numpages = {8}, -url = {http://doi.acm.org/10.1145/2838931.2838934}, -doi = {10.1145/2838931.2838934}, -acmid = {2838934}, -publisher = {ACM}, -address = {New York, NY, USA}, -}""", + bibtex_citation=r""" +@inproceedings{hoogeveen2015, + acmid = {2838934}, + address = {New York, NY, USA}, + articleno = {3}, + author = {Hoogeveen, Doris and Verspoor, Karin M. 
and Baldwin, Timothy}, + booktitle = {Proceedings of the 20th Australasian Document Computing Symposium (ADCS)}, + doi = {10.1145/2838931.2838934}, + isbn = {978-1-4503-4040-3}, + location = {Parramatta, NSW, Australia}, + numpages = {8}, + pages = {3:1--3:8}, + publisher = {ACM}, + series = {ADCS '15}, + title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research}, + url = {http://doi.acm.org/10.1145/2838931.2838934}, + year = {2015}, +} +""", ) diff --git a/mteb/tasks/Retrieval/eng/CQADupstackWebmastersRetrieval.py b/mteb/tasks/Retrieval/eng/CQADupstackWebmastersRetrieval.py index 2e9bd63e08..78564903e7 100644 --- a/mteb/tasks/Retrieval/eng/CQADupstackWebmastersRetrieval.py +++ b/mteb/tasks/Retrieval/eng/CQADupstackWebmastersRetrieval.py @@ -27,21 +27,23 @@ class CQADupstackWebmastersRetrieval(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{hoogeveen2015, -author = {Hoogeveen, Doris and Verspoor, Karin M. and Baldwin, Timothy}, -title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research}, -booktitle = {Proceedings of the 20th Australasian Document Computing Symposium (ADCS)}, -series = {ADCS '15}, -year = {2015}, -isbn = {978-1-4503-4040-3}, -location = {Parramatta, NSW, Australia}, -pages = {3:1--3:8}, -articleno = {3}, -numpages = {8}, -url = {http://doi.acm.org/10.1145/2838931.2838934}, -doi = {10.1145/2838931.2838934}, -acmid = {2838934}, -publisher = {ACM}, -address = {New York, NY, USA}, -}""", + bibtex_citation=r""" +@inproceedings{hoogeveen2015, + acmid = {2838934}, + address = {New York, NY, USA}, + articleno = {3}, + author = {Hoogeveen, Doris and Verspoor, Karin M. 
and Baldwin, Timothy}, + booktitle = {Proceedings of the 20th Australasian Document Computing Symposium (ADCS)}, + doi = {10.1145/2838931.2838934}, + isbn = {978-1-4503-4040-3}, + location = {Parramatta, NSW, Australia}, + numpages = {8}, + pages = {3:1--3:8}, + publisher = {ACM}, + series = {ADCS '15}, + title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research}, + url = {http://doi.acm.org/10.1145/2838931.2838934}, + year = {2015}, +} +""", ) diff --git a/mteb/tasks/Retrieval/eng/CQADupstackWordpressRetrieval.py b/mteb/tasks/Retrieval/eng/CQADupstackWordpressRetrieval.py index 3b11866f82..9b89e232e9 100644 --- a/mteb/tasks/Retrieval/eng/CQADupstackWordpressRetrieval.py +++ b/mteb/tasks/Retrieval/eng/CQADupstackWordpressRetrieval.py @@ -27,21 +27,23 @@ class CQADupstackWordpressRetrieval(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{hoogeveen2015, -author = {Hoogeveen, Doris and Verspoor, Karin M. and Baldwin, Timothy}, -title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research}, -booktitle = {Proceedings of the 20th Australasian Document Computing Symposium (ADCS)}, -series = {ADCS '15}, -year = {2015}, -isbn = {978-1-4503-4040-3}, -location = {Parramatta, NSW, Australia}, -pages = {3:1--3:8}, -articleno = {3}, -numpages = {8}, -url = {http://doi.acm.org/10.1145/2838931.2838934}, -doi = {10.1145/2838931.2838934}, -acmid = {2838934}, -publisher = {ACM}, -address = {New York, NY, USA}, -}""", + bibtex_citation=r""" +@inproceedings{hoogeveen2015, + acmid = {2838934}, + address = {New York, NY, USA}, + articleno = {3}, + author = {Hoogeveen, Doris and Verspoor, Karin M. 
and Baldwin, Timothy}, + booktitle = {Proceedings of the 20th Australasian Document Computing Symposium (ADCS)}, + doi = {10.1145/2838931.2838934}, + isbn = {978-1-4503-4040-3}, + location = {Parramatta, NSW, Australia}, + numpages = {8}, + pages = {3:1--3:8}, + publisher = {ACM}, + series = {ADCS '15}, + title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research}, + url = {http://doi.acm.org/10.1145/2838931.2838934}, + year = {2015}, +} +""", ) diff --git a/mteb/tasks/Retrieval/eng/ChemHotpotQARetrieval.py b/mteb/tasks/Retrieval/eng/ChemHotpotQARetrieval.py index 88fbc50df4..d6f458c432 100644 --- a/mteb/tasks/Retrieval/eng/ChemHotpotQARetrieval.py +++ b/mteb/tasks/Retrieval/eng/ChemHotpotQARetrieval.py @@ -26,35 +26,36 @@ class ChemHotpotQARetrieval(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation=""" - @article{kasmaee2024chemteb, - title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, - author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, - journal={arXiv preprint arXiv:2412.00532}, - year={2024} - } - @inproceedings{yang-etal-2018-hotpotqa, - title = "{H}otpot{QA}: A Dataset for Diverse, Explainable Multi-hop Question Answering", - author = "Yang, Zhilin and - Qi, Peng and - Zhang, Saizheng and - Bengio, Yoshua and - Cohen, William and - Salakhutdinov, Ruslan and - Manning, Christopher D.", - editor = "Riloff, Ellen and - Chiang, David and - Hockenmaier, Julia and - Tsujii, Jun{'}ichi", - booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing", - month = oct # "-" # nov, - year = "2018", - address = "Brussels, Belgium", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/D18-1259", - doi = "10.18653/v1/D18-1259", - 
pages = "2369--2380", - abstract = "Existing question answering (QA) datasets fail to train QA systems to perform complex reasoning and provide explanations for answers. We introduce HotpotQA, a new dataset with 113k Wikipedia-based question-answer pairs with four key features: (1) the questions require finding and reasoning over multiple supporting documents to answer; (2) the questions are diverse and not constrained to any pre-existing knowledge bases or knowledge schemas; (3) we provide sentence-level supporting facts required for reasoning, allowing QA systems to reason with strong supervision and explain the predictions; (4) we offer a new type of factoid comparison questions to test QA systems{'} ability to extract relevant facts and perform necessary comparison. We show that HotpotQA is challenging for the latest QA systems, and the supporting facts enable models to improve performance and make explainable predictions.", - } + bibtex_citation=r""" +@article{kasmaee2024chemteb, + author = {Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal = {arXiv preprint arXiv:2412.00532}, + title = {ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + year = {2024}, +} + +@inproceedings{yang-etal-2018-hotpotqa, + abstract = {Existing question answering (QA) datasets fail to train QA systems to perform complex reasoning and provide explanations for answers. 
We introduce HotpotQA, a new dataset with 113k Wikipedia-based question-answer pairs with four key features: (1) the questions require finding and reasoning over multiple supporting documents to answer; (2) the questions are diverse and not constrained to any pre-existing knowledge bases or knowledge schemas; (3) we provide sentence-level supporting facts required for reasoning, allowing QA systems to reason with strong supervision and explain the predictions; (4) we offer a new type of factoid comparison questions to test QA systems{'} ability to extract relevant facts and perform necessary comparison. We show that HotpotQA is challenging for the latest QA systems, and the supporting facts enable models to improve performance and make explainable predictions.}, + address = {Brussels, Belgium}, + author = {Yang, Zhilin and +Qi, Peng and +Zhang, Saizheng and +Bengio, Yoshua and +Cohen, William and +Salakhutdinov, Ruslan and +Manning, Christopher D.}, + booktitle = {Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing}, + doi = {10.18653/v1/D18-1259}, + editor = {Riloff, Ellen and +Chiang, David and +Hockenmaier, Julia and +Tsujii, Jun{'}ichi}, + month = oct # {-} # nov, + pages = {2369--2380}, + publisher = {Association for Computational Linguistics}, + title = {{H}otpot{QA}: A Dataset for Diverse, Explainable Multi-hop Question Answering}, + url = {https://aclanthology.org/D18-1259}, + year = {2018}, +} """, ) diff --git a/mteb/tasks/Retrieval/eng/ChemNQRetrieval.py b/mteb/tasks/Retrieval/eng/ChemNQRetrieval.py index 6058932550..e970f7b6e9 100644 --- a/mteb/tasks/Retrieval/eng/ChemNQRetrieval.py +++ b/mteb/tasks/Retrieval/eng/ChemNQRetrieval.py @@ -26,20 +26,22 @@ class ChemNQRetrieval(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation=""" - @article{kasmaee2024chemteb, - title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& 
Efficiency on a Specific Domain}, - author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, - journal={arXiv preprint arXiv:2412.00532}, - year={2024} - } - @article{47761, - title = {Natural Questions: a Benchmark for Question Answering Research}, - author = {Tom Kwiatkowski and Jennimaria Palomaki and Olivia Redfield and Michael Collins and Ankur Parikh - and Chris Alberti and Danielle Epstein and Illia Polosukhin and Matthew Kelcey and Jacob Devlin and Kenton Lee - and Kristina N. Toutanova and Llion Jones and Ming-Wei Chang and Andrew Dai and Jakob Uszkoreit and Quoc Le - and Slav Petrov}, - year = {2019}, - journal = {Transactions of the Association of Computational Linguistics}} - """, + bibtex_citation=r""" +@article{47761, + author = {Tom Kwiatkowski and Jennimaria Palomaki and Olivia Redfield and Michael Collins and Ankur Parikh +and Chris Alberti and Danielle Epstein and Illia Polosukhin and Matthew Kelcey and Jacob Devlin and Kenton Lee +and Kristina N. 
Toutanova and Llion Jones and Ming-Wei Chang and Andrew Dai and Jakob Uszkoreit and Quoc Le +and Slav Petrov}, + journal = {Transactions of the Association of Computational Linguistics}, + title = {Natural Questions: a Benchmark for Question Answering Research}, + year = {2019}, +} + +@article{kasmaee2024chemteb, + author = {Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal = {arXiv preprint arXiv:2412.00532}, + title = {ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + year = {2024}, +} +""", ) diff --git a/mteb/tasks/Retrieval/eng/ClimateFEVERRetrieval.py b/mteb/tasks/Retrieval/eng/ClimateFEVERRetrieval.py index dd0eebfa82..8f841ff14f 100644 --- a/mteb/tasks/Retrieval/eng/ClimateFEVERRetrieval.py +++ b/mteb/tasks/Retrieval/eng/ClimateFEVERRetrieval.py @@ -27,14 +27,16 @@ class ClimateFEVER(AbsTaskRetrieval): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@misc{diggelmann2021climatefever, - title={CLIMATE-FEVER: A Dataset for Verification of Real-World Climate Claims}, - author={Thomas Diggelmann and Jordan Boyd-Graber and Jannis Bulian and Massimiliano Ciaramita and Markus Leippold}, - year={2021}, - eprint={2012.00614}, - archivePrefix={arXiv}, - primaryClass={cs.CL} -}""", + bibtex_citation=r""" +@misc{diggelmann2021climatefever, + archiveprefix = {arXiv}, + author = {Thomas Diggelmann and Jordan Boyd-Graber and Jannis Bulian and Massimiliano Ciaramita and Markus Leippold}, + eprint = {2012.00614}, + primaryclass = {cs.CL}, + title = {CLIMATE-FEVER: A Dataset for Verification of Real-World Climate Claims}, + year = {2021}, +} +""", prompt={ "query": "Given a claim about climate change, retrieve documents that support or refute the claim" }, @@ -63,14 +65,16 @@ class ClimateFEVERHardNegatives(AbsTaskRetrieval): 
annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@misc{diggelmann2021climatefever, - title={CLIMATE-FEVER: A Dataset for Verification of Real-World Climate Claims}, - author={Thomas Diggelmann and Jordan Boyd-Graber and Jannis Bulian and Massimiliano Ciaramita and Markus Leippold}, - year={2021}, - eprint={2012.00614}, - archivePrefix={arXiv}, - primaryClass={cs.CL} -}""", + bibtex_citation=r""" +@misc{diggelmann2021climatefever, + archiveprefix = {arXiv}, + author = {Thomas Diggelmann and Jordan Boyd-Graber and Jannis Bulian and Massimiliano Ciaramita and Markus Leippold}, + eprint = {2012.00614}, + primaryclass = {cs.CL}, + title = {CLIMATE-FEVER: A Dataset for Verification of Real-World Climate Claims}, + year = {2021}, +} +""", adapted_from=["ClimateFEVER"], ) @@ -97,14 +101,16 @@ class ClimateFEVERRetrievalv2(AbsTaskRetrieval): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@misc{diggelmann2021climatefever, - title={CLIMATE-FEVER: A Dataset for Verification of Real-World Climate Claims}, - author={Thomas Diggelmann and Jordan Boyd-Graber and Jannis Bulian and Massimiliano Ciaramita and Markus Leippold}, - year={2021}, - eprint={2012.00614}, - archivePrefix={arXiv}, - primaryClass={cs.CL} -}""", + bibtex_citation=r""" +@misc{diggelmann2021climatefever, + archiveprefix = {arXiv}, + author = {Thomas Diggelmann and Jordan Boyd-Graber and Jannis Bulian and Massimiliano Ciaramita and Markus Leippold}, + eprint = {2012.00614}, + primaryclass = {cs.CL}, + title = {CLIMATE-FEVER: A Dataset for Verification of Real-World Climate Claims}, + year = {2021}, +} +""", prompt={ "query": "Given a claim about climate change, retrieve documents that support or refute the claim" }, diff --git a/mteb/tasks/Retrieval/eng/DBPediaRetrieval.py b/mteb/tasks/Retrieval/eng/DBPediaRetrieval.py index 93fd6f68f1..c09043a83c 100644 --- a/mteb/tasks/Retrieval/eng/DBPediaRetrieval.py +++ 
b/mteb/tasks/Retrieval/eng/DBPediaRetrieval.py @@ -27,16 +27,18 @@ class DBPedia(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{Hasibi:2017:DVT, - author = {Hasibi, Faegheh and Nikolaev, Fedor and Xiong, Chenyan and Balog, Krisztian and Bratsberg, Svein Erik and Kotov, Alexander and Callan, Jamie}, - title = {DBpedia-Entity V2: A Test Collection for Entity Search}, - booktitle = {Proceedings of the 40th International ACM SIGIR Conference on Research and Development in Information Retrieval}, - series = {SIGIR '17}, - year = {2017}, - pages = {1265--1268}, - doi = {10.1145/3077136.3080751}, - publisher = {ACM} -}""", + bibtex_citation=r""" +@inproceedings{Hasibi:2017:DVT, + author = {Hasibi, Faegheh and Nikolaev, Fedor and Xiong, Chenyan and Balog, Krisztian and Bratsberg, Svein Erik and Kotov, Alexander and Callan, Jamie}, + booktitle = {Proceedings of the 40th International ACM SIGIR Conference on Research and Development in Information Retrieval}, + doi = {10.1145/3077136.3080751}, + pages = {1265--1268}, + publisher = {ACM}, + series = {SIGIR '17}, + title = {DBpedia-Entity V2: A Test Collection for Entity Search}, + year = {2017}, +} +""", prompt={ "query": "Given a query, retrieve relevant entity descriptions from DBPedia" }, @@ -65,15 +67,17 @@ class DBPediaHardNegatives(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{Hasibi:2017:DVT, - author = {Hasibi, Faegheh and Nikolaev, Fedor and Xiong, Chenyan and Balog, Krisztian and Bratsberg, Svein Erik and Kotov, Alexander and Callan, Jamie}, - title = {DBpedia-Entity V2: A Test Collection for Entity Search}, - booktitle = {Proceedings of the 40th International ACM SIGIR Conference on Research and Development in Information Retrieval}, - series = {SIGIR '17}, - year = {2017}, - pages = {1265--1268}, - doi = {10.1145/3077136.3080751}, - publisher = {ACM} -}""", + 
bibtex_citation=r""" +@inproceedings{Hasibi:2017:DVT, + author = {Hasibi, Faegheh and Nikolaev, Fedor and Xiong, Chenyan and Balog, Krisztian and Bratsberg, Svein Erik and Kotov, Alexander and Callan, Jamie}, + booktitle = {Proceedings of the 40th International ACM SIGIR Conference on Research and Development in Information Retrieval}, + doi = {10.1145/3077136.3080751}, + pages = {1265--1268}, + publisher = {ACM}, + series = {SIGIR '17}, + title = {DBpedia-Entity V2: A Test Collection for Entity Search}, + year = {2017}, +} +""", adapted_from=["DBPedia"], ) diff --git a/mteb/tasks/Retrieval/eng/FEVERRetrieval.py b/mteb/tasks/Retrieval/eng/FEVERRetrieval.py index 2a6130e804..b0e9b89e8a 100644 --- a/mteb/tasks/Retrieval/eng/FEVERRetrieval.py +++ b/mteb/tasks/Retrieval/eng/FEVERRetrieval.py @@ -33,25 +33,27 @@ class FEVER(AbsTaskRetrieval): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{thorne-etal-2018-fever, - title = "{FEVER}: a Large-scale Dataset for Fact Extraction and {VER}ification", - author = "Thorne, James and - Vlachos, Andreas and - Christodoulopoulos, Christos and - Mittal, Arpit", - editor = "Walker, Marilyn and - Ji, Heng and - Stent, Amanda", - booktitle = "Proceedings of the 2018 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)", - month = jun, - year = "2018", - address = "New Orleans, Louisiana", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/N18-1074", - doi = "10.18653/v1/N18-1074", - pages = "809--819", - abstract = "In this paper we introduce a new publicly available dataset for verification against textual sources, FEVER: Fact Extraction and VERification. It consists of 185,445 claims generated by altering sentences extracted from Wikipedia and subsequently verified without knowledge of the sentence they were derived from. 
The claims are classified as Supported, Refuted or NotEnoughInfo by annotators achieving 0.6841 in Fleiss kappa. For the first two classes, the annotators also recorded the sentence(s) forming the necessary evidence for their judgment. To characterize the challenge of the dataset presented, we develop a pipeline approach and compare it to suitably designed oracles. The best accuracy we achieve on labeling a claim accompanied by the correct evidence is 31.87{\%}, while if we ignore the evidence we achieve 50.91{\%}. Thus we believe that FEVER is a challenging testbed that will help stimulate progress on claim verification against textual sources.", -}""", + bibtex_citation=r""" +@inproceedings{thorne-etal-2018-fever, + abstract = {In this paper we introduce a new publicly available dataset for verification against textual sources, FEVER: Fact Extraction and VERification. It consists of 185,445 claims generated by altering sentences extracted from Wikipedia and subsequently verified without knowledge of the sentence they were derived from. The claims are classified as Supported, Refuted or NotEnoughInfo by annotators achieving 0.6841 in Fleiss kappa. For the first two classes, the annotators also recorded the sentence(s) forming the necessary evidence for their judgment. To characterize the challenge of the dataset presented, we develop a pipeline approach and compare it to suitably designed oracles. The best accuracy we achieve on labeling a claim accompanied by the correct evidence is 31.87{\%}, while if we ignore the evidence we achieve 50.91{\%}. 
Thus we believe that FEVER is a challenging testbed that will help stimulate progress on claim verification against textual sources.}, + address = {New Orleans, Louisiana}, + author = {Thorne, James and +Vlachos, Andreas and +Christodoulopoulos, Christos and +Mittal, Arpit}, + booktitle = {Proceedings of the 2018 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)}, + doi = {10.18653/v1/N18-1074}, + editor = {Walker, Marilyn and +Ji, Heng and +Stent, Amanda}, + month = jun, + pages = {809--819}, + publisher = {Association for Computational Linguistics}, + title = {{FEVER}: a Large-scale Dataset for Fact Extraction and {VER}ification}, + url = {https://aclanthology.org/N18-1074}, + year = {2018}, +} +""", prompt={ "query": "Given a claim, retrieve documents that support or refute the claim" }, @@ -80,30 +82,32 @@ class FEVERHardNegatives(AbsTaskRetrieval): eval_langs=["eng-Latn"], main_score="ndcg_at_10", date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, + domains=["Encyclopaedic", "Written"], + task_subtypes=["Claim verification"], + license="cc-by-nc-sa-3.0", + annotations_creators="human-annotated", dialect=None, sample_creation=None, - bibtex_citation="""@inproceedings{thorne-etal-2018-fever, - title = "{FEVER}: a Large-scale Dataset for Fact Extraction and {VER}ification", - author = "Thorne, James and - Vlachos, Andreas and - Christodoulopoulos, Christos and - Mittal, Arpit", - editor = "Walker, Marilyn and - Ji, Heng and - Stent, Amanda", - booktitle = "Proceedings of the 2018 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)", - month = jun, - year = "2018", - address = "New Orleans, Louisiana", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/N18-1074", - doi = "10.18653/v1/N18-1074", - 
pages = "809--819", - abstract = "In this paper we introduce a new publicly available dataset for verification against textual sources, FEVER: Fact Extraction and VERification. It consists of 185,445 claims generated by altering sentences extracted from Wikipedia and subsequently verified without knowledge of the sentence they were derived from. The claims are classified as Supported, Refuted or NotEnoughInfo by annotators achieving 0.6841 in Fleiss kappa. For the first two classes, the annotators also recorded the sentence(s) forming the necessary evidence for their judgment. To characterize the challenge of the dataset presented, we develop a pipeline approach and compare it to suitably designed oracles. The best accuracy we achieve on labeling a claim accompanied by the correct evidence is 31.87{\%}, while if we ignore the evidence we achieve 50.91{\%}. Thus we believe that FEVER is a challenging testbed that will help stimulate progress on claim verification against textual sources.", -}""", + bibtex_citation=r""" +@inproceedings{thorne-etal-2018-fever, + abstract = {In this paper we introduce a new publicly available dataset for verification against textual sources, FEVER: Fact Extraction and VERification. It consists of 185,445 claims generated by altering sentences extracted from Wikipedia and subsequently verified without knowledge of the sentence they were derived from. The claims are classified as Supported, Refuted or NotEnoughInfo by annotators achieving 0.6841 in Fleiss kappa. For the first two classes, the annotators also recorded the sentence(s) forming the necessary evidence for their judgment. To characterize the challenge of the dataset presented, we develop a pipeline approach and compare it to suitably designed oracles. The best accuracy we achieve on labeling a claim accompanied by the correct evidence is 31.87{\%}, while if we ignore the evidence we achieve 50.91{\%}. 
Thus we believe that FEVER is a challenging testbed that will help stimulate progress on claim verification against textual sources.}, + address = {New Orleans, Louisiana}, + author = {Thorne, James and +Vlachos, Andreas and +Christodoulopoulos, Christos and +Mittal, Arpit}, + booktitle = {Proceedings of the 2018 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)}, + doi = {10.18653/v1/N18-1074}, + editor = {Walker, Marilyn and +Ji, Heng and +Stent, Amanda}, + month = jun, + pages = {809--819}, + publisher = {Association for Computational Linguistics}, + title = {{FEVER}: a Large-scale Dataset for Fact Extraction and {VER}ification}, + url = {https://aclanthology.org/N18-1074}, + year = {2018}, +} +""", adapted_from=["FEVER"], ) diff --git a/mteb/tasks/Retrieval/eng/FaithDialRetrieval.py b/mteb/tasks/Retrieval/eng/FaithDialRetrieval.py index 8cd87ed04b..2c883ac02d 100644 --- a/mteb/tasks/Retrieval/eng/FaithDialRetrieval.py +++ b/mteb/tasks/Retrieval/eng/FaithDialRetrieval.py @@ -36,19 +36,19 @@ class FaithDialRetrieval(AbsTaskRetrieval): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @article{dziri2022faithdial, - title = "{FaithDial: A Faithful Benchmark for Information-Seeking Dialogue}", - author = {Dziri, Nouha and Kamalloo, Ehsan and Milton, Sivan and Zaiane, Osmar and Yu, Mo and Ponti, Edoardo M and Reddy, Siva}, - journal = {Transactions of the Association for Computational Linguistics}, - volume = {10}, - pages = {1473--1490}, - year = {2022}, - month = {12}, - publisher = {MIT Press}, - doi={10.1162/tacl_a_00529} - } - """, + bibtex_citation=r""" +@article{dziri2022faithdial, + author = {Dziri, Nouha and Kamalloo, Ehsan and Milton, Sivan and Zaiane, Osmar and Yu, Mo and Ponti, Edoardo M and Reddy, Siva}, + doi = {10.1162/tacl_a_00529}, + journal = {Transactions of the Association for Computational Linguistics}, 
+ month = {12}, + pages = {1473--1490}, + publisher = {MIT Press}, + title = {{FaithDial: A Faithful Benchmark for Information-Seeking Dialogue}}, + volume = {10}, + year = {2022}, +} +""", ) # TODO: Will be removed if curated and added to mteb HF diff --git a/mteb/tasks/Retrieval/eng/FeedbackQARetrieval.py b/mteb/tasks/Retrieval/eng/FeedbackQARetrieval.py index 44f0ac2522..8b388f2c5f 100644 --- a/mteb/tasks/Retrieval/eng/FeedbackQARetrieval.py +++ b/mteb/tasks/Retrieval/eng/FeedbackQARetrieval.py @@ -29,25 +29,25 @@ class FeedbackQARetrieval(AbsTaskRetrieval): annotations_creators="human-annotated", dialect=[], sample_creation="created", - bibtex_citation=""" + bibtex_citation=r""" @inproceedings{li-etal-2022-using, - title = "Using Interactive Feedback to Improve the Accuracy and Explainability of Question Answering Systems Post-Deployment", - author = "Li, Zichao and - Sharma, Prakhar and - Lu, Xing Han and - Cheung, Jackie and - Reddy, Siva", - editor = "Muresan, Smaranda and - Nakov, Preslav and - Villavicencio, Aline", - booktitle = "Findings of the Association for Computational Linguistics: ACL 2022", - month = may, - year = "2022", - address = "Dublin, Ireland", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/2022.findings-acl.75", - doi = "10.18653/v1/2022.findings-acl.75", - pages = "926--937" + address = {Dublin, Ireland}, + author = {Li, Zichao and +Sharma, Prakhar and +Lu, Xing Han and +Cheung, Jackie and +Reddy, Siva}, + booktitle = {Findings of the Association for Computational Linguistics: ACL 2022}, + doi = {10.18653/v1/2022.findings-acl.75}, + editor = {Muresan, Smaranda and +Nakov, Preslav and +Villavicencio, Aline}, + month = may, + pages = {926--937}, + publisher = {Association for Computational Linguistics}, + title = {Using Interactive Feedback to Improve the Accuracy and Explainability of Question Answering Systems Post-Deployment}, + url = {https://aclanthology.org/2022.findings-acl.75}, + year = 
{2022}, } """, ) diff --git a/mteb/tasks/Retrieval/eng/FiQA2018Retrieval.py b/mteb/tasks/Retrieval/eng/FiQA2018Retrieval.py index 7a99d48a95..d8d6534233 100644 --- a/mteb/tasks/Retrieval/eng/FiQA2018Retrieval.py +++ b/mteb/tasks/Retrieval/eng/FiQA2018Retrieval.py @@ -29,14 +29,15 @@ class FiQA2018(AbsTaskRetrieval): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{ -thakur2021beir, -title={{BEIR}: A Heterogeneous Benchmark for Zero-shot Evaluation of Information Retrieval Models}, -author={Nandan Thakur and Nils Reimers and Andreas R{\"u}ckl{\'e} and Abhishek Srivastava and Iryna Gurevych}, -booktitle={Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 2)}, -year={2021}, -url={https://openreview.net/forum?id=wCu6T5xFjeJ} -}""", + bibtex_citation=r""" +@inproceedings{thakur2021beir, + author = {Nandan Thakur and Nils Reimers and Andreas R{\"u}ckl{\'e} and Abhishek Srivastava and Iryna Gurevych}, + booktitle = {Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 2)}, + title = {{BEIR}: A Heterogeneous Benchmark for Zero-shot Evaluation of Information Retrieval Models}, + url = {https://openreview.net/forum?id=wCu6T5xFjeJ}, + year = {2021}, +} +""", prompt={ "query": "Given a financial question, retrieve user replies that best answer the question" }, diff --git a/mteb/tasks/Retrieval/eng/HagridRetrieval.py b/mteb/tasks/Retrieval/eng/HagridRetrieval.py index 74bab52076..0a851a1031 100644 --- a/mteb/tasks/Retrieval/eng/HagridRetrieval.py +++ b/mteb/tasks/Retrieval/eng/HagridRetrieval.py @@ -36,12 +36,14 @@ class HagridRetrieval(AbsTaskRetrieval): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@article{hagrid, - title={{HAGRID}: A Human-LLM Collaborative Dataset for Generative Information-Seeking with Attribution}, - author={Ehsan Kamalloo and Aref Jafari 
and Xinyu Zhang and Nandan Thakur and Jimmy Lin}, - year={2023}, - journal={arXiv:2307.16883}, -}""", + bibtex_citation=r""" +@article{hagrid, + author = {Ehsan Kamalloo and Aref Jafari and Xinyu Zhang and Nandan Thakur and Jimmy Lin}, + journal = {arXiv:2307.16883}, + title = {{HAGRID}: A Human-LLM Collaborative Dataset for Generative Information-Seeking with Attribution}, + year = {2023}, +} +""", ) def load_data(self, **kwargs): diff --git a/mteb/tasks/Retrieval/eng/HellaSwagRetrieval.py b/mteb/tasks/Retrieval/eng/HellaSwagRetrieval.py index 81b53e5c42..1eaeed640e 100644 --- a/mteb/tasks/Retrieval/eng/HellaSwagRetrieval.py +++ b/mteb/tasks/Retrieval/eng/HellaSwagRetrieval.py @@ -28,17 +28,19 @@ class HellaSwag(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@article{xiao2024rar, - title={RAR-b: Reasoning as Retrieval Benchmark}, - author={Xiao, Chenghao and Hudson, G Thomas and Moubayed, Noura Al}, - journal={arXiv preprint arXiv:2404.06347}, - year={2024} + bibtex_citation=r""" +@article{xiao2024rar, + author = {Xiao, Chenghao and Hudson, G Thomas and Moubayed, Noura Al}, + journal = {arXiv preprint arXiv:2404.06347}, + title = {RAR-b: Reasoning as Retrieval Benchmark}, + year = {2024}, } + @article{zellers2019hellaswag, - title={Hellaswag: Can a machine really finish your sentence?}, - author={Zellers, Rowan and Holtzman, Ari and Bisk, Yonatan and Farhadi, Ali and Choi, Yejin}, - journal={arXiv preprint arXiv:1905.07830}, - year={2019} + author = {Zellers, Rowan and Holtzman, Ari and Bisk, Yonatan and Farhadi, Ali and Choi, Yejin}, + journal = {arXiv preprint arXiv:1905.07830}, + title = {Hellaswag: Can a machine really finish your sentence?}, + year = {2019}, } """, prompt={ diff --git a/mteb/tasks/Retrieval/eng/HotpotQARetrieval.py b/mteb/tasks/Retrieval/eng/HotpotQARetrieval.py index cb71abea12..0b8a9e212a 100644 --- a/mteb/tasks/Retrieval/eng/HotpotQARetrieval.py +++ 
b/mteb/tasks/Retrieval/eng/HotpotQARetrieval.py @@ -30,29 +30,31 @@ class HotpotQA(AbsTaskRetrieval): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{yang-etal-2018-hotpotqa, - title = "{H}otpot{QA}: A Dataset for Diverse, Explainable Multi-hop Question Answering", - author = "Yang, Zhilin and - Qi, Peng and - Zhang, Saizheng and - Bengio, Yoshua and - Cohen, William and - Salakhutdinov, Ruslan and - Manning, Christopher D.", - editor = "Riloff, Ellen and - Chiang, David and - Hockenmaier, Julia and - Tsujii, Jun{'}ichi", - booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing", - month = oct # "-" # nov, - year = "2018", - address = "Brussels, Belgium", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/D18-1259", - doi = "10.18653/v1/D18-1259", - pages = "2369--2380", - abstract = "Existing question answering (QA) datasets fail to train QA systems to perform complex reasoning and provide explanations for answers. We introduce HotpotQA, a new dataset with 113k Wikipedia-based question-answer pairs with four key features: (1) the questions require finding and reasoning over multiple supporting documents to answer; (2) the questions are diverse and not constrained to any pre-existing knowledge bases or knowledge schemas; (3) we provide sentence-level supporting facts required for reasoning, allowing QA systems to reason with strong supervision and explain the predictions; (4) we offer a new type of factoid comparison questions to test QA systems{'} ability to extract relevant facts and perform necessary comparison. 
We show that HotpotQA is challenging for the latest QA systems, and the supporting facts enable models to improve performance and make explainable predictions.", -}""", + bibtex_citation=r""" +@inproceedings{yang-etal-2018-hotpotqa, + abstract = {Existing question answering (QA) datasets fail to train QA systems to perform complex reasoning and provide explanations for answers. We introduce HotpotQA, a new dataset with 113k Wikipedia-based question-answer pairs with four key features: (1) the questions require finding and reasoning over multiple supporting documents to answer; (2) the questions are diverse and not constrained to any pre-existing knowledge bases or knowledge schemas; (3) we provide sentence-level supporting facts required for reasoning, allowing QA systems to reason with strong supervision and explain the predictions; (4) we offer a new type of factoid comparison questions to test QA systems{'} ability to extract relevant facts and perform necessary comparison. We show that HotpotQA is challenging for the latest QA systems, and the supporting facts enable models to improve performance and make explainable predictions.}, + address = {Brussels, Belgium}, + author = {Yang, Zhilin and +Qi, Peng and +Zhang, Saizheng and +Bengio, Yoshua and +Cohen, William and +Salakhutdinov, Ruslan and +Manning, Christopher D.}, + booktitle = {Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing}, + doi = {10.18653/v1/D18-1259}, + editor = {Riloff, Ellen and +Chiang, David and +Hockenmaier, Julia and +Tsujii, Jun{'}ichi}, + month = oct # {-} # nov, + pages = {2369--2380}, + publisher = {Association for Computational Linguistics}, + title = {{H}otpot{QA}: A Dataset for Diverse, Explainable Multi-hop Question Answering}, + url = {https://aclanthology.org/D18-1259}, + year = {2018}, +} +""", prompt={ "query": "Given a multi-hop question, retrieve documents that can help answer the question" }, @@ -84,28 +86,30 @@ class 
HotpotQAHardNegatives(AbsTaskRetrieval): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{yang-etal-2018-hotpotqa, - title = "{H}otpot{QA}: A Dataset for Diverse, Explainable Multi-hop Question Answering", - author = "Yang, Zhilin and - Qi, Peng and - Zhang, Saizheng and - Bengio, Yoshua and - Cohen, William and - Salakhutdinov, Ruslan and - Manning, Christopher D.", - editor = "Riloff, Ellen and - Chiang, David and - Hockenmaier, Julia and - Tsujii, Jun{'}ichi", - booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing", - month = oct # "-" # nov, - year = "2018", - address = "Brussels, Belgium", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/D18-1259", - doi = "10.18653/v1/D18-1259", - pages = "2369--2380", - abstract = "Existing question answering (QA) datasets fail to train QA systems to perform complex reasoning and provide explanations for answers. We introduce HotpotQA, a new dataset with 113k Wikipedia-based question-answer pairs with four key features: (1) the questions require finding and reasoning over multiple supporting documents to answer; (2) the questions are diverse and not constrained to any pre-existing knowledge bases or knowledge schemas; (3) we provide sentence-level supporting facts required for reasoning, allowing QA systems to reason with strong supervision and explain the predictions; (4) we offer a new type of factoid comparison questions to test QA systems{'} ability to extract relevant facts and perform necessary comparison. 
We show that HotpotQA is challenging for the latest QA systems, and the supporting facts enable models to improve performance and make explainable predictions.", -}""", + bibtex_citation=r""" +@inproceedings{yang-etal-2018-hotpotqa, + abstract = {Existing question answering (QA) datasets fail to train QA systems to perform complex reasoning and provide explanations for answers. We introduce HotpotQA, a new dataset with 113k Wikipedia-based question-answer pairs with four key features: (1) the questions require finding and reasoning over multiple supporting documents to answer; (2) the questions are diverse and not constrained to any pre-existing knowledge bases or knowledge schemas; (3) we provide sentence-level supporting facts required for reasoning, allowing QA systems to reason with strong supervision and explain the predictions; (4) we offer a new type of factoid comparison questions to test QA systems{'} ability to extract relevant facts and perform necessary comparison. We show that HotpotQA is challenging for the latest QA systems, and the supporting facts enable models to improve performance and make explainable predictions.}, + address = {Brussels, Belgium}, + author = {Yang, Zhilin and +Qi, Peng and +Zhang, Saizheng and +Bengio, Yoshua and +Cohen, William and +Salakhutdinov, Ruslan and +Manning, Christopher D.}, + booktitle = {Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing}, + doi = {10.18653/v1/D18-1259}, + editor = {Riloff, Ellen and +Chiang, David and +Hockenmaier, Julia and +Tsujii, Jun{'}ichi}, + month = oct # {-} # nov, + pages = {2369--2380}, + publisher = {Association for Computational Linguistics}, + title = {{H}otpot{QA}: A Dataset for Diverse, Explainable Multi-hop Question Answering}, + url = {https://aclanthology.org/D18-1259}, + year = {2018}, +} +""", adapted_from=["HotpotQA"], ) diff --git a/mteb/tasks/Retrieval/eng/LEMBNarrativeQARetrieval.py b/mteb/tasks/Retrieval/eng/LEMBNarrativeQARetrieval.py 
index 3d45290d71..e42c63796a 100644 --- a/mteb/tasks/Retrieval/eng/LEMBNarrativeQARetrieval.py +++ b/mteb/tasks/Retrieval/eng/LEMBNarrativeQARetrieval.py @@ -32,31 +32,31 @@ class LEMBNarrativeQARetrieval(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation=""" - @article{kocisky-etal-2018-narrativeqa, - title = "The {N}arrative{QA} Reading Comprehension Challenge", - author = "Ko{\v{c}}isk{\'y}, Tom{\'a}{\v{s}} and - Schwarz, Jonathan and - Blunsom, Phil and - Dyer, Chris and - Hermann, Karl Moritz and - Melis, G{\'a}bor and - Grefenstette, Edward", - editor = "Lee, Lillian and - Johnson, Mark and - Toutanova, Kristina and - Roark, Brian", - journal = "Transactions of the Association for Computational Linguistics", - volume = "6", - year = "2018", - address = "Cambridge, MA", - publisher = "MIT Press", - url = "https://aclanthology.org/Q18-1023", - doi = "10.1162/tacl_a_00023", - pages = "317--328", - abstract = "", - } - """, + bibtex_citation=r""" +@article{kocisky-etal-2018-narrativeqa, + abstract = {}, + address = {Cambridge, MA}, + author = {Ko{\v{c}}isk{\'y}, Tom{\'a}{\v{s}} and +Schwarz, Jonathan and +Blunsom, Phil and +Dyer, Chris and +Hermann, Karl Moritz and +Melis, G{\'a}bor and +Grefenstette, Edward}, + doi = {10.1162/tacl_a_00023}, + editor = {Lee, Lillian and +Johnson, Mark and +Toutanova, Kristina and +Roark, Brian}, + journal = {Transactions of the Association for Computational Linguistics}, + pages = {317--328}, + publisher = {MIT Press}, + title = {The {N}arrative{QA} Reading Comprehension Challenge}, + url = {https://aclanthology.org/Q18-1023}, + volume = {6}, + year = {2018}, +} +""", ) def load_data(self, **kwargs): diff --git a/mteb/tasks/Retrieval/eng/LEMBNeedleRetrieval.py b/mteb/tasks/Retrieval/eng/LEMBNeedleRetrieval.py index c467843d01..1a752a1aab 100644 --- a/mteb/tasks/Retrieval/eng/LEMBNeedleRetrieval.py +++ b/mteb/tasks/Retrieval/eng/LEMBNeedleRetrieval.py @@ -41,14 +41,14 @@ class 
LEMBNeedleRetrieval(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation=""" - @article{zhu2024longembed, - title={LongEmbed: Extending Embedding Models for Long Context Retrieval}, - author={Zhu, Dawei and Wang, Liang and Yang, Nan and Song, Yifan and Wu, Wenhao and Wei, Furu and Li, Sujian}, - journal={arXiv preprint arXiv:2404.12096}, - year={2024} - } - """, + bibtex_citation=r""" +@article{zhu2024longembed, + author = {Zhu, Dawei and Wang, Liang and Yang, Nan and Song, Yifan and Wu, Wenhao and Wei, Furu and Li, Sujian}, + journal = {arXiv preprint arXiv:2404.12096}, + title = {LongEmbed: Extending Embedding Models for Long Context Retrieval}, + year = {2024}, +} +""", ) def load_data(self, **kwargs): diff --git a/mteb/tasks/Retrieval/eng/LEMBPasskeyRetrieval.py b/mteb/tasks/Retrieval/eng/LEMBPasskeyRetrieval.py index f3c9b96485..560c72f562 100644 --- a/mteb/tasks/Retrieval/eng/LEMBPasskeyRetrieval.py +++ b/mteb/tasks/Retrieval/eng/LEMBPasskeyRetrieval.py @@ -41,14 +41,14 @@ class LEMBPasskeyRetrieval(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation=""" - @article{zhu2024longembed, - title={LongEmbed: Extending Embedding Models for Long Context Retrieval}, - author={Zhu, Dawei and Wang, Liang and Yang, Nan and Song, Yifan and Wu, Wenhao and Wei, Furu and Li, Sujian}, - journal={arXiv preprint arXiv:2404.12096}, - year={2024} - } - """, + bibtex_citation=r""" +@article{zhu2024longembed, + author = {Zhu, Dawei and Wang, Liang and Yang, Nan and Song, Yifan and Wu, Wenhao and Wei, Furu and Li, Sujian}, + journal = {arXiv preprint arXiv:2404.12096}, + title = {LongEmbed: Extending Embedding Models for Long Context Retrieval}, + year = {2024}, +} +""", ) def load_data(self, **kwargs): diff --git a/mteb/tasks/Retrieval/eng/LEMBQMSumRetrieval.py b/mteb/tasks/Retrieval/eng/LEMBQMSumRetrieval.py index c302e4758a..2aca710367 100644 --- 
a/mteb/tasks/Retrieval/eng/LEMBQMSumRetrieval.py +++ b/mteb/tasks/Retrieval/eng/LEMBQMSumRetrieval.py @@ -32,40 +32,40 @@ class LEMBQMSumRetrieval(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation=""" - @inproceedings{zhong-etal-2021-qmsum, - title = "{QMS}um: A New Benchmark for Query-based Multi-domain Meeting Summarization", - author = "Zhong, Ming and - Yin, Da and - Yu, Tao and - Zaidi, Ahmad and - Mutuma, Mutethia and - Jha, Rahul and - Awadallah, Ahmed Hassan and - Celikyilmaz, Asli and - Liu, Yang and - Qiu, Xipeng and - Radev, Dragomir", - editor = "Toutanova, Kristina and - Rumshisky, Anna and - Zettlemoyer, Luke and - Hakkani-Tur, Dilek and - Beltagy, Iz and - Bethard, Steven and - Cotterell, Ryan and - Chakraborty, Tanmoy and - Zhou, Yichao", - booktitle = "Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies", - month = jun, - year = "2021", - address = "Online", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/2021.naacl-main.472", - doi = "10.18653/v1/2021.naacl-main.472", - pages = "5905--5921", - abstract = "", - } - """, + bibtex_citation=r""" +@inproceedings{zhong-etal-2021-qmsum, + abstract = {}, + address = {Online}, + author = {Zhong, Ming and +Yin, Da and +Yu, Tao and +Zaidi, Ahmad and +Mutuma, Mutethia and +Jha, Rahul and +Awadallah, Ahmed Hassan and +Celikyilmaz, Asli and +Liu, Yang and +Qiu, Xipeng and +Radev, Dragomir}, + booktitle = {Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies}, + doi = {10.18653/v1/2021.naacl-main.472}, + editor = {Toutanova, Kristina and +Rumshisky, Anna and +Zettlemoyer, Luke and +Hakkani-Tur, Dilek and +Beltagy, Iz and +Bethard, Steven and +Cotterell, Ryan and +Chakraborty, Tanmoy and +Zhou, Yichao}, + month = jun, + pages = 
{5905--5921}, + publisher = {Association for Computational Linguistics}, + title = {{QMS}um: A New Benchmark for Query-based Multi-domain Meeting Summarization}, + url = {https://aclanthology.org/2021.naacl-main.472}, + year = {2021}, +} +""", ) def load_data(self, **kwargs): diff --git a/mteb/tasks/Retrieval/eng/LEMBSummScreenFDRetrieval.py b/mteb/tasks/Retrieval/eng/LEMBSummScreenFDRetrieval.py index c2c6b6db03..748032c767 100644 --- a/mteb/tasks/Retrieval/eng/LEMBSummScreenFDRetrieval.py +++ b/mteb/tasks/Retrieval/eng/LEMBSummScreenFDRetrieval.py @@ -32,27 +32,27 @@ class LEMBSummScreenFDRetrieval(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation=""" - @inproceedings{chen-etal-2022-summscreen, - title = "{S}umm{S}creen: A Dataset for Abstractive Screenplay Summarization", - author = "Chen, Mingda and - Chu, Zewei and - Wiseman, Sam and - Gimpel, Kevin", - editor = "Muresan, Smaranda and - Nakov, Preslav and - Villavicencio, Aline", - booktitle = "Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)", - month = may, - year = "2022", - address = "Dublin, Ireland", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/2022.acl-long.589", - doi = "10.18653/v1/2022.acl-long.589", - pages = "8602--8615", - abstract = "", - } - """, + bibtex_citation=r""" +@inproceedings{chen-etal-2022-summscreen, + abstract = {}, + address = {Dublin, Ireland}, + author = {Chen, Mingda and +Chu, Zewei and +Wiseman, Sam and +Gimpel, Kevin}, + booktitle = {Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)}, + doi = {10.18653/v1/2022.acl-long.589}, + editor = {Muresan, Smaranda and +Nakov, Preslav and +Villavicencio, Aline}, + month = may, + pages = {8602--8615}, + publisher = {Association for Computational Linguistics}, + title = {{S}umm{S}creen: A Dataset for Abstractive 
Screenplay Summarization}, + url = {https://aclanthology.org/2022.acl-long.589}, + year = {2022}, +} +""", ) def load_data(self, **kwargs): diff --git a/mteb/tasks/Retrieval/eng/LEMBWikimQARetrieval.py b/mteb/tasks/Retrieval/eng/LEMBWikimQARetrieval.py index 04e8b3bb86..5e03c5caf8 100644 --- a/mteb/tasks/Retrieval/eng/LEMBWikimQARetrieval.py +++ b/mteb/tasks/Retrieval/eng/LEMBWikimQARetrieval.py @@ -32,15 +32,15 @@ class LEMBWikimQARetrieval(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation=""" - @inproceedings{ho2020constructing, - title={Constructing A Multi-hop QA Dataset for Comprehensive Evaluation of Reasoning Steps}, - author={Ho, Xanh and Nguyen, Anh-Khoa Duong and Sugawara, Saku and Aizawa, Akiko}, - booktitle={Proceedings of the 28th International Conference on Computational Linguistics}, - pages={6609--6625}, - year={2020} - } - """, + bibtex_citation=r""" +@inproceedings{ho2020constructing, + author = {Ho, Xanh and Nguyen, Anh-Khoa Duong and Sugawara, Saku and Aizawa, Akiko}, + booktitle = {Proceedings of the 28th International Conference on Computational Linguistics}, + pages = {6609--6625}, + title = {Constructing A Multi-hop QA Dataset for Comprehensive Evaluation of Reasoning Steps}, + year = {2020}, +} +""", ) def load_data(self, **kwargs): diff --git a/mteb/tasks/Retrieval/eng/LegalBenchConsumerContractsQARetrieval.py b/mteb/tasks/Retrieval/eng/LegalBenchConsumerContractsQARetrieval.py index 39923194ec..743c9af0f1 100644 --- a/mteb/tasks/Retrieval/eng/LegalBenchConsumerContractsQARetrieval.py +++ b/mteb/tasks/Retrieval/eng/LegalBenchConsumerContractsQARetrieval.py @@ -27,17 +27,19 @@ class LegalBenchConsumerContractsQA(AbsTaskRetrieval): annotations_creators="derived", dialect=None, sample_creation="found", - bibtex_citation="""@article{koreeda2021contractnli, - title={ContractNLI: A dataset for document-level natural language inference for contracts}, - author={Koreeda, Yuta and Manning, 
Christopher D}, - journal={arXiv preprint arXiv:2110.01799}, - year={2021} - } + bibtex_citation=r""" +@article{hendrycks2021cuad, + author = {Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, + journal = {arXiv preprint arXiv:2103.06268}, + title = {Cuad: An expert-annotated nlp dataset for legal contract review}, + year = {2021}, +} - @article{hendrycks2021cuad, - title={Cuad: An expert-annotated nlp dataset for legal contract review}, - author={Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, - journal={arXiv preprint arXiv:2103.06268}, - year={2021} -}""", +@article{koreeda2021contractnli, + author = {Koreeda, Yuta and Manning, Christopher D}, + journal = {arXiv preprint arXiv:2110.01799}, + title = {ContractNLI: A dataset for document-level natural language inference for contracts}, + year = {2021}, +} +""", ) diff --git a/mteb/tasks/Retrieval/eng/LegalBenchCorporateLobbyingRetrieval.py b/mteb/tasks/Retrieval/eng/LegalBenchCorporateLobbyingRetrieval.py index a5003b09be..1138eb1155 100644 --- a/mteb/tasks/Retrieval/eng/LegalBenchCorporateLobbyingRetrieval.py +++ b/mteb/tasks/Retrieval/eng/LegalBenchCorporateLobbyingRetrieval.py @@ -27,74 +27,84 @@ class LegalBenchCorporateLobbying(AbsTaskRetrieval): annotations_creators="derived", dialect=None, sample_creation="found", - bibtex_citation="""@misc{guha2023legalbench, - title={LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, - author={Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. 
Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, - year={2023}, - eprint={2308.11462}, - archivePrefix={arXiv}, - primaryClass={cs.CL} + bibtex_citation=r""" +@misc{guha2023legalbench, + archiveprefix = {arXiv}, + author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li}, + eprint = {2308.11462}, + primaryclass = {cs.CL}, + title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models}, + year = {2023}, } + +@article{hendrycks2021cuad, + author = {Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, + journal = {arXiv preprint arXiv:2103.06268}, + title = {Cuad: An expert-annotated nlp dataset for legal contract review}, + year = {2021}, +} + +@article{holzenberger2021factoring, + author = {Holzenberger, Nils and Van Durme, Benjamin}, + journal = {arXiv preprint arXiv:2105.07903}, + title = {Factoring statutory reasoning as language understanding challenges}, + year = {2021}, +} + @article{koreeda2021contractnli, - title={ContractNLI: A dataset for document-level natural language inference for contracts}, - author={Koreeda, Yuta and 
Manning, Christopher D}, - journal={arXiv preprint arXiv:2110.01799}, - year={2021} + author = {Koreeda, Yuta and Manning, Christopher D}, + journal = {arXiv preprint arXiv:2110.01799}, + title = {ContractNLI: A dataset for document-level natural language inference for contracts}, + year = {2021}, } -@article{hendrycks2021cuad, - title={Cuad: An expert-annotated nlp dataset for legal contract review}, - author={Hendrycks, Dan and Burns, Collin and Chen, Anya and Ball, Spencer}, - journal={arXiv preprint arXiv:2103.06268}, - year={2021} + +@article{lippi2019claudette, + author = {Lippi, Marco and Pa{\l}ka, Przemys{\l}aw and Contissa, Giuseppe and Lagioia, Francesca and Micklitz, Hans-Wolfgang and Sartor, Giovanni and Torroni, Paolo}, + journal = {Artificial Intelligence and Law}, + pages = {117--139}, + publisher = {Springer}, + title = {CLAUDETTE: an automated detector of potentially unfair clauses in online terms of service}, + volume = {27}, + year = {2019}, } + +@article{ravichander2019question, + author = {Ravichander, Abhilasha and Black, Alan W and Wilson, Shomir and Norton, Thomas and Sadeh, Norman}, + journal = {arXiv preprint arXiv:1911.00841}, + title = {Question answering for privacy policies: Combining computational and legal perspectives}, + year = {2019}, +} + @article{wang2023maud, - title={MAUD: An Expert-Annotated Legal NLP Dataset for Merger Agreement Understanding}, - author={Wang, Steven H and Scardigli, Antoine and Tang, Leonard and Chen, Wei and Levkin, Dimitry and Chen, Anya and Ball, Spencer and Woodside, Thomas and Zhang, Oliver and Hendrycks, Dan}, - journal={arXiv preprint arXiv:2301.00876}, - year={2023} + author = {Wang, Steven H and Scardigli, Antoine and Tang, Leonard and Chen, Wei and Levkin, Dimitry and Chen, Anya and Ball, Spencer and Woodside, Thomas and Zhang, Oliver and Hendrycks, Dan}, + journal = {arXiv preprint arXiv:2301.00876}, + title = {MAUD: An Expert-Annotated Legal NLP Dataset for Merger Agreement Understanding}, + 
year = {2023}, } + @inproceedings{wilson2016creation, - title={The creation and analysis of a website privacy policy corpus}, - author={Wilson, Shomir and Schaub, Florian and Dara, Aswarth Abhilash and Liu, Frederick and Cherivirala, Sushain and Leon, Pedro Giovanni and Andersen, Mads Schaarup and Zimmeck, Sebastian and Sathyendra, Kanthashree Mysore and Russell, N Cameron and others}, - booktitle={Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)}, - pages={1330--1340}, - year={2016} + author = {Wilson, Shomir and Schaub, Florian and Dara, Aswarth Abhilash and Liu, Frederick and Cherivirala, Sushain and Leon, Pedro Giovanni and Andersen, Mads Schaarup and Zimmeck, Sebastian and Sathyendra, Kanthashree Mysore and Russell, N Cameron and others}, + booktitle = {Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)}, + pages = {1330--1340}, + title = {The creation and analysis of a website privacy policy corpus}, + year = {2016}, } + @inproceedings{zheng2021does, - title={When does pretraining help? assessing self-supervised learning for law and the casehold dataset of 53,000+ legal holdings}, - author={Zheng, Lucia and Guha, Neel and Anderson, Brandon R and Henderson, Peter and Ho, Daniel E}, - booktitle={Proceedings of the eighteenth international conference on artificial intelligence and law}, - pages={159--168}, - year={2021} + author = {Zheng, Lucia and Guha, Neel and Anderson, Brandon R and Henderson, Peter and Ho, Daniel E}, + booktitle = {Proceedings of the eighteenth international conference on artificial intelligence and law}, + pages = {159--168}, + title = {When does pretraining help? 
assessing self-supervised learning for law and the casehold dataset of 53,000+ legal holdings}, + year = {2021}, } + @article{zimmeck2019maps, - title={Maps: Scaling privacy compliance analysis to a million apps}, - author={Zimmeck, Sebastian and Story, Peter and Smullen, Daniel and Ravichander, Abhilasha and Wang, Ziqi and Reidenberg, Joel R and Russell, N Cameron and Sadeh, Norman}, - journal={Proc. Priv. Enhancing Tech.}, - volume={2019}, - pages={66}, - year={2019} -} -@article{ravichander2019question, - title={Question answering for privacy policies: Combining computational and legal perspectives}, - author={Ravichander, Abhilasha and Black, Alan W and Wilson, Shomir and Norton, Thomas and Sadeh, Norman}, - journal={arXiv preprint arXiv:1911.00841}, - year={2019} -} -@article{holzenberger2021factoring, - title={Factoring statutory reasoning as language understanding challenges}, - author={Holzenberger, Nils and Van Durme, Benjamin}, - journal={arXiv preprint arXiv:2105.07903}, - year={2021} -} -@article{lippi2019claudette, - title={CLAUDETTE: an automated detector of potentially unfair clauses in online terms of service}, - author={Lippi, Marco and Pa{\l}ka, Przemys{\l}aw and Contissa, Giuseppe and Lagioia, Francesca and Micklitz, Hans-Wolfgang and Sartor, Giovanni and Torroni, Paolo}, - journal={Artificial Intelligence and Law}, - volume={27}, - pages={117--139}, - year={2019}, - publisher={Springer} + author = {Zimmeck, Sebastian and Story, Peter and Smullen, Daniel and Ravichander, Abhilasha and Wang, Ziqi and Reidenberg, Joel R and Russell, N Cameron and Sadeh, Norman}, + journal = {Proc. Priv. 
Enhancing Tech.}, + pages = {66}, + title = {Maps: Scaling privacy compliance analysis to a million apps}, + volume = {2019}, + year = {2019}, } """, ) diff --git a/mteb/tasks/Retrieval/eng/LegalSummarizationRetrieval.py b/mteb/tasks/Retrieval/eng/LegalSummarizationRetrieval.py index 3fc4cf167d..0e0d8ddb80 100644 --- a/mteb/tasks/Retrieval/eng/LegalSummarizationRetrieval.py +++ b/mteb/tasks/Retrieval/eng/LegalSummarizationRetrieval.py @@ -27,16 +27,18 @@ class LegalSummarization(AbsTaskRetrieval): annotations_creators="derived", dialect=None, sample_creation="found", - bibtex_citation="""@inproceedings{manor-li-2019-plain, - title = "Plain {E}nglish Summarization of Contracts", - author = "Manor, Laura and - Li, Junyi Jessy", - booktitle = "Proceedings of the Natural Legal Language Processing Workshop 2019", - month = jun, - year = "2019", - address = "Minneapolis, Minnesota", - publisher = "Association for Computational Linguistics", - url = "https://www.aclweb.org/anthology/W19-2201", - pages = "1--11", -}""", + bibtex_citation=r""" +@inproceedings{manor-li-2019-plain, + address = {Minneapolis, Minnesota}, + author = {Manor, Laura and +Li, Junyi Jessy}, + booktitle = {Proceedings of the Natural Legal Language Processing Workshop 2019}, + month = jun, + pages = {1--11}, + publisher = {Association for Computational Linguistics}, + title = {Plain {E}nglish Summarization of Contracts}, + url = {https://www.aclweb.org/anthology/W19-2201}, + year = {2019}, +} +""", ) diff --git a/mteb/tasks/Retrieval/eng/LitSearchRetrieval.py b/mteb/tasks/Retrieval/eng/LitSearchRetrieval.py index b9c3683ed4..894f009d87 100644 --- a/mteb/tasks/Retrieval/eng/LitSearchRetrieval.py +++ b/mteb/tasks/Retrieval/eng/LitSearchRetrieval.py @@ -35,11 +35,13 @@ class LitSearchRetrieval(AbsTaskRetrieval): annotations_creators="LM-generated", # generated by GPT-4 dialect=[], sample_creation="found", # queries LLM generated, corpus samples are found (extracted from S2ORC) - 
bibtex_citation="""@article{ajith2024litsearch, - title={LitSearch: A Retrieval Benchmark for Scientific Literature Search}, - author={Ajith, Anirudh and Xia, Mengzhou and Chevalier, Alexis and Goyal, Tanya and Chen, Danqi and Gao, Tianyu}, - year={2024} - }""", + bibtex_citation=r""" +@article{ajith2024litsearch, + author = {Ajith, Anirudh and Xia, Mengzhou and Chevalier, Alexis and Goyal, Tanya and Chen, Danqi and Gao, Tianyu}, + title = {LitSearch: A Retrieval Benchmark for Scientific Literature Search}, + year = {2024}, +} +""", ) def load_data(self, **kwargs): diff --git a/mteb/tasks/Retrieval/eng/MLQuestions.py b/mteb/tasks/Retrieval/eng/MLQuestions.py index 6b594be445..a381abe269 100644 --- a/mteb/tasks/Retrieval/eng/MLQuestions.py +++ b/mteb/tasks/Retrieval/eng/MLQuestions.py @@ -39,23 +39,23 @@ class MLQuestionsRetrieval(AbsTaskRetrieval): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @inproceedings{kulshreshtha-etal-2021-back, - title = "Back-Training excels Self-Training at Unsupervised Domain Adaptation of Question Generation and Passage Retrieval", - author = "Kulshreshtha, Devang and - Belfer, Robert and - Serban, Iulian Vlad and - Reddy, Siva", - booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing", - month = nov, - year = "2021", - address = "Online and Punta Cana, Dominican Republic", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/2021.emnlp-main.566", - pages = "7064--7078", - abstract = "In this work, we introduce back-training, an alternative to self-training for unsupervised domain adaptation (UDA). While self-training generates synthetic training data where natural inputs are aligned with noisy outputs, back-training results in natural outputs aligned with noisy inputs. 
This significantly reduces the gap between target domain and synthetic data distribution, and reduces model overfitting to source domain. We run UDA experiments on question generation and passage retrieval from the Natural Questions domain to machine learning and biomedical domains. We find that back-training vastly outperforms self-training by a mean improvement of 7.8 BLEU-4 points on generation, and 17.6{\%} top-20 retrieval accuracy across both domains. We further propose consistency filters to remove low-quality synthetic data before training. We also release a new domain-adaptation dataset - MLQuestions containing 35K unaligned questions, 50K unaligned passages, and 3K aligned question-passage pairs.", - } - """, + bibtex_citation=r""" +@inproceedings{kulshreshtha-etal-2021-back, + abstract = {In this work, we introduce back-training, an alternative to self-training for unsupervised domain adaptation (UDA). While self-training generates synthetic training data where natural inputs are aligned with noisy outputs, back-training results in natural outputs aligned with noisy inputs. This significantly reduces the gap between target domain and synthetic data distribution, and reduces model overfitting to source domain. We run UDA experiments on question generation and passage retrieval from the Natural Questions domain to machine learning and biomedical domains. We find that back-training vastly outperforms self-training by a mean improvement of 7.8 BLEU-4 points on generation, and 17.6{\%} top-20 retrieval accuracy across both domains. We further propose consistency filters to remove low-quality synthetic data before training. 
We also release a new domain-adaptation dataset - MLQuestions containing 35K unaligned questions, 50K unaligned passages, and 3K aligned question-passage pairs.}, + address = {Online and Punta Cana, Dominican Republic}, + author = {Kulshreshtha, Devang and +Belfer, Robert and +Serban, Iulian Vlad and +Reddy, Siva}, + booktitle = {Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing}, + month = nov, + pages = {7064--7078}, + publisher = {Association for Computational Linguistics}, + title = {Back-Training excels Self-Training at Unsupervised Domain Adaptation of Question Generation and Passage Retrieval}, + url = {https://aclanthology.org/2021.emnlp-main.566}, + year = {2021}, +} +""", ) def load_data(self, **kwargs): diff --git a/mteb/tasks/Retrieval/eng/MSMARCORetrieval.py b/mteb/tasks/Retrieval/eng/MSMARCORetrieval.py index 252bf075b4..26f78f76e0 100644 --- a/mteb/tasks/Retrieval/eng/MSMARCORetrieval.py +++ b/mteb/tasks/Retrieval/eng/MSMARCORetrieval.py @@ -40,26 +40,27 @@ class MSMARCO(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@article{DBLP:journals/corr/NguyenRSGTMD16, - author = {Tri Nguyen and - Mir Rosenberg and - Xia Song and - Jianfeng Gao and - Saurabh Tiwary and - Rangan Majumder and - Li Deng}, - title = {{MS} {MARCO:} {A} Human Generated MAchine Reading COmprehension Dataset}, - journal = {CoRR}, - volume = {abs/1611.09268}, - year = {2016}, - url = {http://arxiv.org/abs/1611.09268}, - archivePrefix = {arXiv}, - eprint = {1611.09268}, + bibtex_citation=r""" +@article{DBLP:journals/corr/NguyenRSGTMD16, + archiveprefix = {arXiv}, + author = {Tri Nguyen and +Mir Rosenberg and +Xia Song and +Jianfeng Gao and +Saurabh Tiwary and +Rangan Majumder and +Li Deng}, + bibsource = {dblp computer science bibliography, https://dblp.org}, + biburl = {https://dblp.org/rec/journals/corr/NguyenRSGTMD16.bib}, + eprint = {1611.09268}, + journal = {CoRR}, timestamp = {Mon, 
13 Aug 2018 16:49:03 +0200}, - biburl = {https://dblp.org/rec/journals/corr/NguyenRSGTMD16.bib}, - bibsource = {dblp computer science bibliography, https://dblp.org} + title = {{MS} {MARCO:} {A} Human Generated MAchine Reading COmprehension Dataset}, + url = {http://arxiv.org/abs/1611.09268}, + volume = {abs/1611.09268}, + year = {2016}, } -}""", +""", prompt={ "query": "Given a web search query, retrieve relevant passages that answer the query" }, @@ -101,25 +102,26 @@ class MSMARCOHardNegatives(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@article{DBLP:journals/corr/NguyenRSGTMD16, - author = {Tri Nguyen and - Mir Rosenberg and - Xia Song and - Jianfeng Gao and - Saurabh Tiwary and - Rangan Majumder and - Li Deng}, - title = {{MS} {MARCO:} {A} Human Generated MAchine Reading COmprehension Dataset}, - journal = {CoRR}, - volume = {abs/1611.09268}, - year = {2016}, - url = {http://arxiv.org/abs/1611.09268}, - archivePrefix = {arXiv}, - eprint = {1611.09268}, + bibtex_citation=r""" +@article{DBLP:journals/corr/NguyenRSGTMD16, + archiveprefix = {arXiv}, + author = {Tri Nguyen and +Mir Rosenberg and +Xia Song and +Jianfeng Gao and +Saurabh Tiwary and +Rangan Majumder and +Li Deng}, + bibsource = {dblp computer science bibliography, https://dblp.org}, + biburl = {https://dblp.org/rec/journals/corr/NguyenRSGTMD16.bib}, + eprint = {1611.09268}, + journal = {CoRR}, timestamp = {Mon, 13 Aug 2018 16:49:03 +0200}, - biburl = {https://dblp.org/rec/journals/corr/NguyenRSGTMD16.bib}, - bibsource = {dblp computer science bibliography, https://dblp.org} + title = {{MS} {MARCO:} {A} Human Generated MAchine Reading COmprehension Dataset}, + url = {http://arxiv.org/abs/1611.09268}, + volume = {abs/1611.09268}, + year = {2016}, } -}""", +""", adapted_from=["MSMARCO"], ) diff --git a/mteb/tasks/Retrieval/eng/MSMARCOv2Retrieval.py b/mteb/tasks/Retrieval/eng/MSMARCOv2Retrieval.py index 0a784805f9..920d79c855 100644 --- 
a/mteb/tasks/Retrieval/eng/MSMARCOv2Retrieval.py +++ b/mteb/tasks/Retrieval/eng/MSMARCOv2Retrieval.py @@ -38,25 +38,26 @@ class MSMARCOv2(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@article{DBLP:journals/corr/NguyenRSGTMD16, - author = {Tri Nguyen and - Mir Rosenberg and - Xia Song and - Jianfeng Gao and - Saurabh Tiwary and - Rangan Majumder and - Li Deng}, - title = {{MS} {MARCO:} {A} Human Generated MAchine Reading COmprehension Dataset}, - journal = {CoRR}, - volume = {abs/1611.09268}, - year = {2016}, - url = {http://arxiv.org/abs/1611.09268}, - archivePrefix = {arXiv}, - eprint = {1611.09268}, - timestamp = {Mon, 13 Aug 2018 16:49:03 +0200}, - biburl = {https://dblp.org/rec/journals/corr/NguyenRSGTMD16.bib}, - bibsource = {dblp computer science bibliography, https://dblp.org} - } - }""", + bibtex_citation=r""" +@article{DBLP:journals/corr/NguyenRSGTMD16, + archiveprefix = {arXiv}, + author = {Tri Nguyen and +Mir Rosenberg and +Xia Song and +Jianfeng Gao and +Saurabh Tiwary and +Rangan Majumder and +Li Deng}, + bibsource = {dblp computer science bibliography, https://dblp.org}, + biburl = {https://dblp.org/rec/journals/corr/NguyenRSGTMD16.bib}, + eprint = {1611.09268}, + journal = {CoRR}, + timestamp = {Mon, 13 Aug 2018 16:49:03 +0200}, + title = {{MS} {MARCO:} {A} Human Generated MAchine Reading COmprehension Dataset}, + url = {http://arxiv.org/abs/1611.09268}, + volume = {abs/1611.09268}, + year = {2016}, +} +""", adapted_from=["MSMARCO"], ) diff --git a/mteb/tasks/Retrieval/eng/MedicalQARetrieval.py b/mteb/tasks/Retrieval/eng/MedicalQARetrieval.py index 12607572bd..a3017155ee 100644 --- a/mteb/tasks/Retrieval/eng/MedicalQARetrieval.py +++ b/mteb/tasks/Retrieval/eng/MedicalQARetrieval.py @@ -26,14 +26,16 @@ class MedicalQARetrieval(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@ARTICLE{BenAbacha-BMC-2019, - author = {Asma, Ben 
Abacha and Dina, Demner{-}Fushman}, - title = {A Question-Entailment Approach to Question Answering}, - journal = {{BMC} Bioinform.}, - volume = {20}, - number = {1}, - pages = {511:1--511:23}, - year = {2019}, - url = {https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-019-3119-4} - } """, + bibtex_citation=r""" +@article{BenAbacha-BMC-2019, + author = {Asma, Ben Abacha and Dina, Demner{-}Fushman}, + journal = {{BMC} Bioinform.}, + number = {1}, + pages = {511:1--511:23}, + title = {A Question-Entailment Approach to Question Answering}, + url = {https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-019-3119-4}, + volume = {20}, + year = {2019}, +} +""", ) diff --git a/mteb/tasks/Retrieval/eng/NFCorpusRetrieval.py b/mteb/tasks/Retrieval/eng/NFCorpusRetrieval.py index 31f4eb60b1..bc69775d72 100644 --- a/mteb/tasks/Retrieval/eng/NFCorpusRetrieval.py +++ b/mteb/tasks/Retrieval/eng/NFCorpusRetrieval.py @@ -27,16 +27,18 @@ class NFCorpus(AbsTaskRetrieval): annotations_creators=None, dialect=None, sample_creation=None, - bibtex_citation="""@inproceedings{boteva2016, + bibtex_citation=r""" +@inproceedings{boteva2016, author = {Boteva, Vera and Gholipour, Demian and Sokolov, Artem and Riezler, Stefan}, - title = {A Full-Text Learning to Rank Dataset for Medical Information Retrieval}, + city = {Padova}, + country = {Italy}, journal = {Proceedings of the 38th European Conference on Information Retrieval}, journal-abbrev = {ECIR}, + title = {A Full-Text Learning to Rank Dataset for Medical Information Retrieval}, + url = {http://www.cl.uni-heidelberg.de/~riezler/publications/papers/ECIR2016.pdf}, year = {2016}, - city = {Padova}, - country = {Italy}, - url = {http://www.cl.uni-heidelberg.de/~riezler/publications/papers/ECIR2016.pdf} -}""", +} +""", prompt={ "query": "Given a question, retrieve relevant documents that best answer the question" }, diff --git a/mteb/tasks/Retrieval/eng/NQRetrieval.py b/mteb/tasks/Retrieval/eng/NQRetrieval.py index 
33334493d3..105e46224a 100644 --- a/mteb/tasks/Retrieval/eng/NQRetrieval.py +++ b/mteb/tasks/Retrieval/eng/NQRetrieval.py @@ -27,12 +27,18 @@ class NQ(AbsTaskRetrieval): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@article{47761,title = {Natural Questions: a Benchmark for Question Answering Research}, - author = {Tom Kwiatkowski and Jennimaria Palomaki and Olivia Redfield and Michael Collins and Ankur Parikh - and Chris Alberti and Danielle Epstein and Illia Polosukhin and Matthew Kelcey and Jacob Devlin and Kenton Lee - and Kristina N. Toutanova and Llion Jones and Ming-Wei Chang and Andrew Dai and Jakob Uszkoreit and Quoc Le - and Slav Petrov},year = {2019},journal = {Transactions of the Association of Computational - Linguistics}}""", + bibtex_citation=r""" +@article{47761, + author = {Tom Kwiatkowski and Jennimaria Palomaki and Olivia Redfield and Michael Collins and Ankur Parikh +and Chris Alberti and Danielle Epstein and Illia Polosukhin and Matthew Kelcey and Jacob Devlin and Kenton Lee +and Kristina N. Toutanova and Llion Jones and Ming-Wei Chang and Andrew Dai and Jakob Uszkoreit and Quoc Le +and Slav Petrov}, + journal = {Transactions of the Association of Computational +Linguistics}, + title = {Natural Questions: a Benchmark for Question Answering Research}, + year = {2019}, +} +""", prompt={ "query": "Given a question, retrieve Wikipedia passages that answer the question" }, @@ -61,12 +67,18 @@ class NQHardNegatives(AbsTaskRetrieval): annotations_creators=None, dialect=None, sample_creation=None, - bibtex_citation="""@article{47761,title = {Natural Questions: a Benchmark for Question Answering Research}, - author = {Tom Kwiatkowski and Jennimaria Palomaki and Olivia Redfield and Michael Collins and Ankur Parikh - and Chris Alberti and Danielle Epstein and Illia Polosukhin and Matthew Kelcey and Jacob Devlin and Kenton Lee - and Kristina N. 
Toutanova and Llion Jones and Ming-Wei Chang and Andrew Dai and Jakob Uszkoreit and Quoc Le - and Slav Petrov},year = {2019},journal = {Transactions of the Association of Computational - Linguistics}}""", + bibtex_citation=r""" +@article{47761, + author = {Tom Kwiatkowski and Jennimaria Palomaki and Olivia Redfield and Michael Collins and Ankur Parikh +and Chris Alberti and Danielle Epstein and Illia Polosukhin and Matthew Kelcey and Jacob Devlin and Kenton Lee +and Kristina N. Toutanova and Llion Jones and Ming-Wei Chang and Andrew Dai and Jakob Uszkoreit and Quoc Le +and Slav Petrov}, + journal = {Transactions of the Association of Computational +Linguistics}, + title = {Natural Questions: a Benchmark for Question Answering Research}, + year = {2019}, +} +""", prompt={ "query": "Given a question, retrieve Wikipedia passages that answer the question" }, diff --git a/mteb/tasks/Retrieval/eng/NanoArguAnaRetrieval.py b/mteb/tasks/Retrieval/eng/NanoArguAnaRetrieval.py index 9f8ccbd783..190a9810fa 100644 --- a/mteb/tasks/Retrieval/eng/NanoArguAnaRetrieval.py +++ b/mteb/tasks/Retrieval/eng/NanoArguAnaRetrieval.py @@ -30,16 +30,18 @@ class NanoArguAnaRetrieval(AbsTaskRetrieval): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{boteva2016, + bibtex_citation=r""" +@inproceedings{boteva2016, author = {Boteva, Vera and Gholipour, Demian and Sokolov, Artem and Riezler, Stefan}, - title = {A Full-Text Learning to Rank Dataset for Medical Information Retrieval}, + city = {Padova}, + country = {Italy}, journal = {Proceedings of the 38th European Conference on Information Retrieval}, journal-abbrev = {ECIR}, + title = {A Full-Text Learning to Rank Dataset for Medical Information Retrieval}, + url = {http://www.cl.uni-heidelberg.de/~riezler/publications/papers/ECIR2016.pdf}, year = {2016}, - city = {Padova}, - country = {Italy}, - url = {http://www.cl.uni-heidelberg.de/~riezler/publications/papers/ECIR2016.pdf} 
-}""", +} +""", prompt={"query": "Given a claim, find documents that refute the claim"}, adapted_from=["ArguAna"], ) diff --git a/mteb/tasks/Retrieval/eng/NanoClimateFeverRetrieval.py b/mteb/tasks/Retrieval/eng/NanoClimateFeverRetrieval.py index bde0dc4b32..910dafa0e4 100644 --- a/mteb/tasks/Retrieval/eng/NanoClimateFeverRetrieval.py +++ b/mteb/tasks/Retrieval/eng/NanoClimateFeverRetrieval.py @@ -30,14 +30,16 @@ class NanoClimateFeverRetrieval(AbsTaskRetrieval): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@misc{diggelmann2021climatefever, - title={CLIMATE-FEVER: A Dataset for Verification of Real-World Climate Claims}, - author={Thomas Diggelmann and Jordan Boyd-Graber and Jannis Bulian and Massimiliano Ciaramita and Markus Leippold}, - year={2021}, - eprint={2012.00614}, - archivePrefix={arXiv}, - primaryClass={cs.CL} -}""", + bibtex_citation=r""" +@misc{diggelmann2021climatefever, + archiveprefix = {arXiv}, + author = {Thomas Diggelmann and Jordan Boyd-Graber and Jannis Bulian and Massimiliano Ciaramita and Markus Leippold}, + eprint = {2012.00614}, + primaryclass = {cs.CL}, + title = {CLIMATE-FEVER: A Dataset for Verification of Real-World Climate Claims}, + year = {2021}, +} +""", prompt={ "query": "Given a claim about climate change, retrieve documents that support or refute the claim" }, diff --git a/mteb/tasks/Retrieval/eng/NanoDBPediaRetrieval.py b/mteb/tasks/Retrieval/eng/NanoDBPediaRetrieval.py index da21922d72..b0cd4fd753 100644 --- a/mteb/tasks/Retrieval/eng/NanoDBPediaRetrieval.py +++ b/mteb/tasks/Retrieval/eng/NanoDBPediaRetrieval.py @@ -30,7 +30,14 @@ class NanoDBPediaRetrieval(AbsTaskRetrieval): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@article{lehmann2015dbpedia, title={DBpedia: A large-scale, multilingual knowledge base extracted from Wikipedia}, author={Lehmann, Jens and et al.}, journal={Semantic Web}, year={2015}}""", + 
bibtex_citation=r""" +@article{lehmann2015dbpedia, + author = {Lehmann, Jens and et al.}, + journal = {Semantic Web}, + title = {DBpedia: A large-scale, multilingual knowledge base extracted from Wikipedia}, + year = {2015}, +} +""", prompt={ "query": "Given a query, retrieve relevant entity descriptions from DBPedia" }, diff --git a/mteb/tasks/Retrieval/eng/NanoFEVERRetrieval.py b/mteb/tasks/Retrieval/eng/NanoFEVERRetrieval.py index 8fbb463ecb..aa560aa40d 100644 --- a/mteb/tasks/Retrieval/eng/NanoFEVERRetrieval.py +++ b/mteb/tasks/Retrieval/eng/NanoFEVERRetrieval.py @@ -33,25 +33,27 @@ class NanoFEVERRetrieval(AbsTaskRetrieval): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{thorne-etal-2018-fever, - title = "{FEVER}: a Large-scale Dataset for Fact Extraction and {VER}ification", - author = "Thorne, James and - Vlachos, Andreas and - Christodoulopoulos, Christos and - Mittal, Arpit", - editor = "Walker, Marilyn and - Ji, Heng and - Stent, Amanda", - booktitle = "Proceedings of the 2018 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)", - month = jun, - year = "2018", - address = "New Orleans, Louisiana", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/N18-1074", - doi = "10.18653/v1/N18-1074", - pages = "809--819", - abstract = "In this paper we introduce a new publicly available dataset for verification against textual sources, FEVER: Fact Extraction and VERification. It consists of 185,445 claims generated by altering sentences extracted from Wikipedia and subsequently verified without knowledge of the sentence they were derived from. The claims are classified as Supported, Refuted or NotEnoughInfo by annotators achieving 0.6841 in Fleiss kappa. 
For the first two classes, the annotators also recorded the sentence(s) forming the necessary evidence for their judgment. To characterize the challenge of the dataset presented, we develop a pipeline approach and compare it to suitably designed oracles. The best accuracy we achieve on labeling a claim accompanied by the correct evidence is 31.87{\%}, while if we ignore the evidence we achieve 50.91{\%}. Thus we believe that FEVER is a challenging testbed that will help stimulate progress on claim verification against textual sources.", -}""", + bibtex_citation=r""" +@inproceedings{thorne-etal-2018-fever, + abstract = {In this paper we introduce a new publicly available dataset for verification against textual sources, FEVER: Fact Extraction and VERification. It consists of 185,445 claims generated by altering sentences extracted from Wikipedia and subsequently verified without knowledge of the sentence they were derived from. The claims are classified as Supported, Refuted or NotEnoughInfo by annotators achieving 0.6841 in Fleiss kappa. For the first two classes, the annotators also recorded the sentence(s) forming the necessary evidence for their judgment. To characterize the challenge of the dataset presented, we develop a pipeline approach and compare it to suitably designed oracles. The best accuracy we achieve on labeling a claim accompanied by the correct evidence is 31.87{\%}, while if we ignore the evidence we achieve 50.91{\%}. 
Thus we believe that FEVER is a challenging testbed that will help stimulate progress on claim verification against textual sources.}, + address = {New Orleans, Louisiana}, + author = {Thorne, James and +Vlachos, Andreas and +Christodoulopoulos, Christos and +Mittal, Arpit}, + booktitle = {Proceedings of the 2018 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)}, + doi = {10.18653/v1/N18-1074}, + editor = {Walker, Marilyn and +Ji, Heng and +Stent, Amanda}, + month = jun, + pages = {809--819}, + publisher = {Association for Computational Linguistics}, + title = {{FEVER}: a Large-scale Dataset for Fact Extraction and {VER}ification}, + url = {https://aclanthology.org/N18-1074}, + year = {2018}, +} +""", prompt={ "query": "Given a claim, retrieve documents that support or refute the claim" }, diff --git a/mteb/tasks/Retrieval/eng/NanoFiQA2018Retrieval.py b/mteb/tasks/Retrieval/eng/NanoFiQA2018Retrieval.py index acef51ee26..0061fa6c79 100644 --- a/mteb/tasks/Retrieval/eng/NanoFiQA2018Retrieval.py +++ b/mteb/tasks/Retrieval/eng/NanoFiQA2018Retrieval.py @@ -30,14 +30,15 @@ class NanoFiQA2018Retrieval(AbsTaskRetrieval): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{ -thakur2021beir, -title={{BEIR}: A Heterogeneous Benchmark for Zero-shot Evaluation of Information Retrieval Models}, -author={Nandan Thakur and Nils Reimers and Andreas R{\"u}ckl{\'e} and Abhishek Srivastava and Iryna Gurevych}, -booktitle={Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 2)}, -year={2021}, -url={https://openreview.net/forum?id=wCu6T5xFjeJ} -}""", + bibtex_citation=r""" +@inproceedings{thakur2021beir, + author = {Nandan Thakur and Nils Reimers and Andreas R{\"u}ckl{\'e} and Abhishek Srivastava and Iryna Gurevych}, + booktitle = {Thirty-fifth Conference on Neural Information 
Processing Systems Datasets and Benchmarks Track (Round 2)}, + title = {{BEIR}: A Heterogeneous Benchmark for Zero-shot Evaluation of Information Retrieval Models}, + url = {https://openreview.net/forum?id=wCu6T5xFjeJ}, + year = {2021}, +} +""", prompt={ "query": "Given a financial question, retrieve user replies that best answer the question" }, diff --git a/mteb/tasks/Retrieval/eng/NanoHotpotQARetrieval.py b/mteb/tasks/Retrieval/eng/NanoHotpotQARetrieval.py index cc033a1a2c..4726f328fe 100644 --- a/mteb/tasks/Retrieval/eng/NanoHotpotQARetrieval.py +++ b/mteb/tasks/Retrieval/eng/NanoHotpotQARetrieval.py @@ -32,29 +32,31 @@ class NanoHotpotQARetrieval(AbsTaskRetrieval): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{yang-etal-2018-hotpotqa, - title = "{H}otpot{QA}: A Dataset for Diverse, Explainable Multi-hop Question Answering", - author = "Yang, Zhilin and - Qi, Peng and - Zhang, Saizheng and - Bengio, Yoshua and - Cohen, William and - Salakhutdinov, Ruslan and - Manning, Christopher D.", - editor = "Riloff, Ellen and - Chiang, David and - Hockenmaier, Julia and - Tsujii, Jun{'}ichi", - booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing", - month = oct # "-" # nov, - year = "2018", - address = "Brussels, Belgium", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/D18-1259", - doi = "10.18653/v1/D18-1259", - pages = "2369--2380", - abstract = "Existing question answering (QA) datasets fail to train QA systems to perform complex reasoning and provide explanations for answers. 
We introduce HotpotQA, a new dataset with 113k Wikipedia-based question-answer pairs with four key features: (1) the questions require finding and reasoning over multiple supporting documents to answer; (2) the questions are diverse and not constrained to any pre-existing knowledge bases or knowledge schemas; (3) we provide sentence-level supporting facts required for reasoning, allowing QA systems to reason with strong supervision and explain the predictions; (4) we offer a new type of factoid comparison questions to test QA systems{'} ability to extract relevant facts and perform necessary comparison. We show that HotpotQA is challenging for the latest QA systems, and the supporting facts enable models to improve performance and make explainable predictions.", -}""", + bibtex_citation=r""" +@inproceedings{yang-etal-2018-hotpotqa, + abstract = {Existing question answering (QA) datasets fail to train QA systems to perform complex reasoning and provide explanations for answers. We introduce HotpotQA, a new dataset with 113k Wikipedia-based question-answer pairs with four key features: (1) the questions require finding and reasoning over multiple supporting documents to answer; (2) the questions are diverse and not constrained to any pre-existing knowledge bases or knowledge schemas; (3) we provide sentence-level supporting facts required for reasoning, allowing QA systems to reason with strong supervision and explain the predictions; (4) we offer a new type of factoid comparison questions to test QA systems{'} ability to extract relevant facts and perform necessary comparison. 
We show that HotpotQA is challenging for the latest QA systems, and the supporting facts enable models to improve performance and make explainable predictions.}, + address = {Brussels, Belgium}, + author = {Yang, Zhilin and +Qi, Peng and +Zhang, Saizheng and +Bengio, Yoshua and +Cohen, William and +Salakhutdinov, Ruslan and +Manning, Christopher D.}, + booktitle = {Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing}, + doi = {10.18653/v1/D18-1259}, + editor = {Riloff, Ellen and +Chiang, David and +Hockenmaier, Julia and +Tsujii, Jun{'}ichi}, + month = oct # {-} # nov, + pages = {2369--2380}, + publisher = {Association for Computational Linguistics}, + title = {{H}otpot{QA}: A Dataset for Diverse, Explainable Multi-hop Question Answering}, + url = {https://aclanthology.org/D18-1259}, + year = {2018}, +} +""", prompt={ "query": "Given a multi-hop question, retrieve documents that can help answer the question" }, diff --git a/mteb/tasks/Retrieval/eng/NanoMSMARCORetrieval.py b/mteb/tasks/Retrieval/eng/NanoMSMARCORetrieval.py index d30c3c24b4..b2d5241cdc 100644 --- a/mteb/tasks/Retrieval/eng/NanoMSMARCORetrieval.py +++ b/mteb/tasks/Retrieval/eng/NanoMSMARCORetrieval.py @@ -30,26 +30,27 @@ class NanoMSMARCORetrieval(AbsTaskRetrieval): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@article{DBLP:journals/corr/NguyenRSGTMD16, - author = {Tri Nguyen and - Mir Rosenberg and - Xia Song and - Jianfeng Gao and - Saurabh Tiwary and - Rangan Majumder and - Li Deng}, - title = {{MS} {MARCO:} {A} Human Generated MAchine Reading COmprehension Dataset}, - journal = {CoRR}, - volume = {abs/1611.09268}, - year = {2016}, - url = {http://arxiv.org/abs/1611.09268}, - archivePrefix = {arXiv}, - eprint = {1611.09268}, + bibtex_citation=r""" +@article{DBLP:journals/corr/NguyenRSGTMD16, + archiveprefix = {arXiv}, + author = {Tri Nguyen and +Mir Rosenberg and +Xia Song and +Jianfeng Gao and +Saurabh Tiwary 
and +Rangan Majumder and +Li Deng}, + bibsource = {dblp computer science bibliography, https://dblp.org}, + biburl = {https://dblp.org/rec/journals/corr/NguyenRSGTMD16.bib}, + eprint = {1611.09268}, + journal = {CoRR}, timestamp = {Mon, 13 Aug 2018 16:49:03 +0200}, - biburl = {https://dblp.org/rec/journals/corr/NguyenRSGTMD16.bib}, - bibsource = {dblp computer science bibliography, https://dblp.org} + title = {{MS} {MARCO:} {A} Human Generated MAchine Reading COmprehension Dataset}, + url = {http://arxiv.org/abs/1611.09268}, + volume = {abs/1611.09268}, + year = {2016}, } -}""", +""", prompt={ "query": "Given a web search query, retrieve relevant passages that answer the query" }, diff --git a/mteb/tasks/Retrieval/eng/NanoNFCorpusRetrieval.py b/mteb/tasks/Retrieval/eng/NanoNFCorpusRetrieval.py index 157491df9b..e72d9647e3 100644 --- a/mteb/tasks/Retrieval/eng/NanoNFCorpusRetrieval.py +++ b/mteb/tasks/Retrieval/eng/NanoNFCorpusRetrieval.py @@ -30,16 +30,18 @@ class NanoNFCorpusRetrieval(AbsTaskRetrieval): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{boteva2016, + bibtex_citation=r""" +@inproceedings{boteva2016, author = {Boteva, Vera and Gholipour, Demian and Sokolov, Artem and Riezler, Stefan}, - title = {A Full-Text Learning to Rank Dataset for Medical Information Retrieval}, + city = {Padova}, + country = {Italy}, journal = {Proceedings of the 38th European Conference on Information Retrieval}, journal-abbrev = {ECIR}, + title = {A Full-Text Learning to Rank Dataset for Medical Information Retrieval}, + url = {http://www.cl.uni-heidelberg.de/~riezler/publications/papers/ECIR2016.pdf}, year = {2016}, - city = {Padova}, - country = {Italy}, - url = {http://www.cl.uni-heidelberg.de/~riezler/publications/papers/ECIR2016.pdf} -}""", +} +""", prompt={ "query": "Given a question, retrieve relevant documents that best answer the question" }, diff --git a/mteb/tasks/Retrieval/eng/NanoNQRetrieval.py 
b/mteb/tasks/Retrieval/eng/NanoNQRetrieval.py index 0cac9088bc..52c741d988 100644 --- a/mteb/tasks/Retrieval/eng/NanoNQRetrieval.py +++ b/mteb/tasks/Retrieval/eng/NanoNQRetrieval.py @@ -30,12 +30,18 @@ class NanoNQRetrieval(AbsTaskRetrieval): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@article{47761,title = {Natural Questions: a Benchmark for Question Answering Research}, - author = {Tom Kwiatkowski and Jennimaria Palomaki and Olivia Redfield and Michael Collins and Ankur Parikh - and Chris Alberti and Danielle Epstein and Illia Polosukhin and Matthew Kelcey and Jacob Devlin and Kenton Lee - and Kristina N. Toutanova and Llion Jones and Ming-Wei Chang and Andrew Dai and Jakob Uszkoreit and Quoc Le - and Slav Petrov},year = {2019},journal = {Transactions of the Association of Computational - Linguistics}}""", + bibtex_citation=r""" +@article{47761, + author = {Tom Kwiatkowski and Jennimaria Palomaki and Olivia Redfield and Michael Collins and Ankur Parikh +and Chris Alberti and Danielle Epstein and Illia Polosukhin and Matthew Kelcey and Jacob Devlin and Kenton Lee +and Kristina N. 
Toutanova and Llion Jones and Ming-Wei Chang and Andrew Dai and Jakob Uszkoreit and Quoc Le +and Slav Petrov}, + journal = {Transactions of the Association of Computational +Linguistics}, + title = {Natural Questions: a Benchmark for Question Answering Research}, + year = {2019}, +} +""", prompt={ "query": "Given a question, retrieve Wikipedia passages that answer the question" }, diff --git a/mteb/tasks/Retrieval/eng/NanoQuoraRetrieval.py b/mteb/tasks/Retrieval/eng/NanoQuoraRetrieval.py index f22e2c4420..a8cd253cc9 100644 --- a/mteb/tasks/Retrieval/eng/NanoQuoraRetrieval.py +++ b/mteb/tasks/Retrieval/eng/NanoQuoraRetrieval.py @@ -32,13 +32,15 @@ class NanoQuoraRetrieval(AbsTaskRetrieval): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@misc{quora-question-pairs, - author = {DataCanary, hilfialkaff, Lili Jiang, Meg Risdal, Nikhil Dandekar, tomtung}, - title = {Quora Question Pairs}, - publisher = {Kaggle}, - year = {2017}, - url = {https://kaggle.com/competitions/quora-question-pairs} -}""", + bibtex_citation=r""" +@misc{quora-question-pairs, + author = {DataCanary, hilfialkaff, Lili Jiang, Meg Risdal, Nikhil Dandekar, tomtung}, + publisher = {Kaggle}, + title = {Quora Question Pairs}, + url = {https://kaggle.com/competitions/quora-question-pairs}, + year = {2017}, +} +""", prompt={ "query": "Given a question, retrieve questions that are semantically equivalent to the given question" }, diff --git a/mteb/tasks/Retrieval/eng/NanoSCIDOCSRetrieval.py b/mteb/tasks/Retrieval/eng/NanoSCIDOCSRetrieval.py index 7f376ca30d..ebd97ace44 100644 --- a/mteb/tasks/Retrieval/eng/NanoSCIDOCSRetrieval.py +++ b/mteb/tasks/Retrieval/eng/NanoSCIDOCSRetrieval.py @@ -32,12 +32,14 @@ class NanoSCIDOCSRetrieval(AbsTaskRetrieval): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{specter2020cohan, - title={SPECTER: Document-level Representation Learning using Citation-informed 
Transformers}, - author={Arman Cohan and Sergey Feldman and Iz Beltagy and Doug Downey and Daniel S. Weld}, - booktitle={ACL}, - year={2020} -}""", + bibtex_citation=r""" +@inproceedings{specter2020cohan, + author = {Arman Cohan and Sergey Feldman and Iz Beltagy and Doug Downey and Daniel S. Weld}, + booktitle = {ACL}, + title = {SPECTER: Document-level Representation Learning using Citation-informed Transformers}, + year = {2020}, +} +""", prompt={ "query": "Given a scientific paper title, retrieve paper abstracts that are cited by the given paper" }, diff --git a/mteb/tasks/Retrieval/eng/NanoSciFactRetrieval.py b/mteb/tasks/Retrieval/eng/NanoSciFactRetrieval.py index 63e1827de8..8e2716c401 100644 --- a/mteb/tasks/Retrieval/eng/NanoSciFactRetrieval.py +++ b/mteb/tasks/Retrieval/eng/NanoSciFactRetrieval.py @@ -30,12 +30,14 @@ class NanoSciFactRetrieval(AbsTaskRetrieval): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{specter2020cohan, - title={SPECTER: Document-level Representation Learning using Citation-informed Transformers}, - author={Arman Cohan and Sergey Feldman and Iz Beltagy and Doug Downey and Daniel S. Weld}, - booktitle={ACL}, - year={2020} -}""", + bibtex_citation=r""" +@inproceedings{specter2020cohan, + author = {Arman Cohan and Sergey Feldman and Iz Beltagy and Doug Downey and Daniel S. 
Weld}, + booktitle = {ACL}, + title = {SPECTER: Document-level Representation Learning using Citation-informed Transformers}, + year = {2020}, +} +""", prompt={ "query": "Given a scientific claim, retrieve documents that support or refute the claim" }, diff --git a/mteb/tasks/Retrieval/eng/NanoTouche2020Retrieval.py b/mteb/tasks/Retrieval/eng/NanoTouche2020Retrieval.py index cd07fa8453..d48f90ba2b 100644 --- a/mteb/tasks/Retrieval/eng/NanoTouche2020Retrieval.py +++ b/mteb/tasks/Retrieval/eng/NanoTouche2020Retrieval.py @@ -30,23 +30,25 @@ class NanoTouche2020Retrieval(AbsTaskRetrieval): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@dataset{potthast_2022_6862281, - author = {Potthast, Martin and - Gienapp, Lukas and - Wachsmuth, Henning and - Hagen, Matthias and - Fröbe, Maik and - Bondarenko, Alexander and - Ajjour, Yamen and - Stein, Benno}, - title = {{Touché20-Argument-Retrieval-for-Controversial- - Questions}}, - month = jul, - year = 2022, - publisher = {Zenodo}, - doi = {10.5281/zenodo.6862281}, - url = {https://doi.org/10.5281/zenodo.6862281} -}""", + bibtex_citation=r""" +@dataset{potthast_2022_6862281, + author = {Potthast, Martin and +Gienapp, Lukas and +Wachsmuth, Henning and +Hagen, Matthias and +Fröbe, Maik and +Bondarenko, Alexander and +Ajjour, Yamen and +Stein, Benno}, + doi = {10.5281/zenodo.6862281}, + month = jul, + publisher = {Zenodo}, + title = {{Touché20-Argument-Retrieval-for-Controversial- +Questions}}, + url = {https://doi.org/10.5281/zenodo.6862281}, + year = {2022}, +} +""", prompt={ "query": "Given a question, retrieve detailed and persuasive arguments that answer the question" }, diff --git a/mteb/tasks/Retrieval/eng/NarrativeQARetrieval.py b/mteb/tasks/Retrieval/eng/NarrativeQARetrieval.py index 988048bb0b..bd5ce1dc5a 100644 --- a/mteb/tasks/Retrieval/eng/NarrativeQARetrieval.py +++ b/mteb/tasks/Retrieval/eng/NarrativeQARetrieval.py @@ -34,14 +34,16 @@ class 
NarrativeQARetrieval(AbsTaskRetrieval): annotations_creators=None, dialect=None, sample_creation=None, - bibtex_citation="""@misc{kočiský2017narrativeqa, - title={The NarrativeQA Reading Comprehension Challenge}, - author={Tomáš Kočiský and Jonathan Schwarz and Phil Blunsom and Chris Dyer and Karl Moritz Hermann and Gábor Melis and Edward Grefenstette}, - year={2017}, - eprint={1712.07040}, - archivePrefix={arXiv}, - primaryClass={cs.CL} -}""", + bibtex_citation=r""" +@misc{kočiský2017narrativeqa, + archiveprefix = {arXiv}, + author = {Tomáš Kočiský and Jonathan Schwarz and Phil Blunsom and Chris Dyer and Karl Moritz Hermann and Gábor Melis and Edward Grefenstette}, + eprint = {1712.07040}, + primaryclass = {cs.CL}, + title = {The NarrativeQA Reading Comprehension Challenge}, + year = {2017}, +} +""", ) def load_data(self, **kwargs): diff --git a/mteb/tasks/Retrieval/eng/PiqaRetrieval.py b/mteb/tasks/Retrieval/eng/PiqaRetrieval.py index 335c252a7e..4b3e7126a9 100644 --- a/mteb/tasks/Retrieval/eng/PiqaRetrieval.py +++ b/mteb/tasks/Retrieval/eng/PiqaRetrieval.py @@ -28,20 +28,22 @@ class PIQA(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@article{xiao2024rar, - title={RAR-b: Reasoning as Retrieval Benchmark}, - author={Xiao, Chenghao and Hudson, G Thomas and Moubayed, Noura Al}, - journal={arXiv preprint arXiv:2404.06347}, - year={2024} -} + bibtex_citation=r""" @inproceedings{bisk2020piqa, - title={Piqa: Reasoning about physical commonsense in natural language}, - author={Bisk, Yonatan and Zellers, Rowan and Gao, Jianfeng and Choi, Yejin and others}, - booktitle={Proceedings of the AAAI conference on artificial intelligence}, - volume={34}, - number={05}, - pages={7432--7439}, - year={2020} + author = {Bisk, Yonatan and Zellers, Rowan and Gao, Jianfeng and Choi, Yejin and others}, + booktitle = {Proceedings of the AAAI conference on artificial intelligence}, + number = {05}, + pages = {7432--7439}, + 
title = {Piqa: Reasoning about physical commonsense in natural language}, + volume = {34}, + year = {2020}, +} + +@article{xiao2024rar, + author = {Xiao, Chenghao and Hudson, G Thomas and Moubayed, Noura Al}, + journal = {arXiv preprint arXiv:2404.06347}, + title = {RAR-b: Reasoning as Retrieval Benchmark}, + year = {2024}, } """, prompt={"query": "Given the following goal, retrieve a possible solution."}, diff --git a/mteb/tasks/Retrieval/eng/QuailRetrieval.py b/mteb/tasks/Retrieval/eng/QuailRetrieval.py index 221e11cc0f..b89eed9792 100644 --- a/mteb/tasks/Retrieval/eng/QuailRetrieval.py +++ b/mteb/tasks/Retrieval/eng/QuailRetrieval.py @@ -28,20 +28,22 @@ class Quail(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@article{xiao2024rar, - title={RAR-b: Reasoning as Retrieval Benchmark}, - author={Xiao, Chenghao and Hudson, G Thomas and Moubayed, Noura Al}, - journal={arXiv preprint arXiv:2404.06347}, - year={2024} -} + bibtex_citation=r""" @inproceedings{rogers2020getting, - title={Getting closer to AI complete question answering: A set of prerequisite real tasks}, - author={Rogers, Anna and Kovaleva, Olga and Downey, Matthew and Rumshisky, Anna}, - booktitle={Proceedings of the AAAI conference on artificial intelligence}, - volume={34}, - number={05}, - pages={8722--8731}, - year={2020} + author = {Rogers, Anna and Kovaleva, Olga and Downey, Matthew and Rumshisky, Anna}, + booktitle = {Proceedings of the AAAI conference on artificial intelligence}, + number = {05}, + pages = {8722--8731}, + title = {Getting closer to AI complete question answering: A set of prerequisite real tasks}, + volume = {34}, + year = {2020}, +} + +@article{xiao2024rar, + author = {Xiao, Chenghao and Hudson, G Thomas and Moubayed, Noura Al}, + journal = {arXiv preprint arXiv:2404.06347}, + title = {RAR-b: Reasoning as Retrieval Benchmark}, + year = {2024}, } """, prompt={ diff --git a/mteb/tasks/Retrieval/eng/QuoraRetrieval.py 
b/mteb/tasks/Retrieval/eng/QuoraRetrieval.py index 9210ab1249..de5b59d953 100644 --- a/mteb/tasks/Retrieval/eng/QuoraRetrieval.py +++ b/mteb/tasks/Retrieval/eng/QuoraRetrieval.py @@ -32,13 +32,15 @@ class QuoraRetrieval(AbsTaskRetrieval): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@misc{quora-question-pairs, - author = {DataCanary, hilfialkaff, Lili Jiang, Meg Risdal, Nikhil Dandekar, tomtung}, - title = {Quora Question Pairs}, - publisher = {Kaggle}, - year = {2017}, - url = {https://kaggle.com/competitions/quora-question-pairs} -}""", + bibtex_citation=r""" +@misc{quora-question-pairs, + author = {DataCanary, hilfialkaff, Lili Jiang, Meg Risdal, Nikhil Dandekar, tomtung}, + publisher = {Kaggle}, + title = {Quora Question Pairs}, + url = {https://kaggle.com/competitions/quora-question-pairs}, + year = {2017}, +} +""", prompt={ "query": "Given a question, retrieve questions that are semantically equivalent to the given question" }, @@ -72,12 +74,14 @@ class QuoraRetrievalHardNegatives(AbsTaskRetrieval): annotations_creators=None, dialect=None, sample_creation=None, - bibtex_citation="""@misc{quora-question-pairs, - author = {DataCanary, hilfialkaff, Lili Jiang, Meg Risdal, Nikhil Dandekar, tomtung}, - title = {Quora Question Pairs}, - publisher = {Kaggle}, - year = {2017}, - url = {https://kaggle.com/competitions/quora-question-pairs} -}""", + bibtex_citation=r""" +@misc{quora-question-pairs, + author = {DataCanary, hilfialkaff, Lili Jiang, Meg Risdal, Nikhil Dandekar, tomtung}, + publisher = {Kaggle}, + title = {Quora Question Pairs}, + url = {https://kaggle.com/competitions/quora-question-pairs}, + year = {2017}, +} +""", adapted_from=["QuoraRetrieval"], ) diff --git a/mteb/tasks/Retrieval/eng/RARbCodeRetrieval.py b/mteb/tasks/Retrieval/eng/RARbCodeRetrieval.py index b42cd4bd71..bdc4518fd2 100644 --- a/mteb/tasks/Retrieval/eng/RARbCodeRetrieval.py +++ b/mteb/tasks/Retrieval/eng/RARbCodeRetrieval.py @@ -28,28 
+28,26 @@ class RARbCode(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@article{xiao2024rar, - title={RAR-b: Reasoning as Retrieval Benchmark}, - author={Xiao, Chenghao and Hudson, G Thomas and Moubayed, Noura Al}, - journal={arXiv preprint arXiv:2404.06347}, - year={2024} + bibtex_citation=r""" +@article{husain2019codesearchnet, + author = {Husain, Hamel and Wu, Ho-Hsiang and Gazit, Tiferet and Allamanis, Miltiadis and Brockschmidt, Marc}, + journal = {arXiv preprint arXiv:1909.09436}, + title = {Codesearchnet challenge: Evaluating the state of semantic code search}, + year = {2019}, } + @article{muennighoff2023octopack, - title={Octopack: Instruction tuning code large language models}, - author={Muennighoff, Niklas and Liu, Qian and Zebaze, Armel and Zheng, Qinkai and Hui, Binyuan and Zhuo, Terry Yue and Singh, Swayam and Tang, Xiangru and Von Werra, Leandro and Longpre, Shayne}, - journal={arXiv preprint arXiv:2308.07124}, - year={2023} + author = {Muennighoff, Niklas and Liu, Qian and Zebaze, Armel and Zheng, Qinkai and Hui, Binyuan and Zhuo, Terry Yue and Singh, Swayam and Tang, Xiangru and Von Werra, Leandro and Longpre, Shayne}, + journal = {arXiv preprint arXiv:2308.07124}, + title = {Octopack: Instruction tuning code large language models}, + year = {2023}, } -@article{austin2021program, - title={Program Synthesis with Large Language Models}, - author={Austin, Jacob and Odena, Augustus and Nye, Maxwell and Bosma, Maarten and Michalewski, Henryk and Dohan, David and Jiang, Ellen and Cai, Carrie and Terry, Michael and Le, Quoc and others}, - journal={arXiv preprint arXiv:2108.07732}, - year={2021} -@article{husain2019codesearchnet, - title={Codesearchnet challenge: Evaluating the state of semantic code search}, - author={Husain, Hamel and Wu, Ho-Hsiang and Gazit, Tiferet and Allamanis, Miltiadis and Brockschmidt, Marc}, - journal={arXiv preprint arXiv:1909.09436}, - year={2019} + 
+@article{xiao2024rar, + author = {Xiao, Chenghao and Hudson, G Thomas and Moubayed, Noura Al}, + journal = {arXiv preprint arXiv:2404.06347}, + title = {RAR-b: Reasoning as Retrieval Benchmark}, + year = {2024}, } """, prompt={"query": "Retrieve the answer for the following coding problem."}, diff --git a/mteb/tasks/Retrieval/eng/RARbMathRetrieval.py b/mteb/tasks/Retrieval/eng/RARbMathRetrieval.py index 88855a8eaf..05e55becd4 100644 --- a/mteb/tasks/Retrieval/eng/RARbMathRetrieval.py +++ b/mteb/tasks/Retrieval/eng/RARbMathRetrieval.py @@ -28,29 +28,33 @@ class RARbMath(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@article{xiao2024rar, - title={RAR-b: Reasoning as Retrieval Benchmark}, - author={Xiao, Chenghao and Hudson, G Thomas and Moubayed, Noura Al}, - journal={arXiv preprint arXiv:2404.06347}, - year={2024} + bibtex_citation=r""" +@article{cobbe2021training, + author = {Cobbe, Karl and Kosaraju, Vineet and Bavarian, Mohammad and Chen, Mark and Jun, Heewoo and Kaiser, Lukasz and Plappert, Matthias and Tworek, Jerry and Hilton, Jacob and Nakano, Reiichiro and others}, + journal = {arXiv preprint arXiv:2110.14168}, + title = {Training verifiers to solve math word problems}, + year = {2021}, } + @article{hendrycks2021measuring, - title={Measuring mathematical problem solving with the math dataset}, - author={Hendrycks, Dan and Burns, Collin and Kadavath, Saurav and Arora, Akul and Basart, Steven and Tang, Eric and Song, Dawn and Steinhardt, Jacob}, - journal={arXiv preprint arXiv:2103.03874}, - year={2021} + author = {Hendrycks, Dan and Burns, Collin and Kadavath, Saurav and Arora, Akul and Basart, Steven and Tang, Eric and Song, Dawn and Steinhardt, Jacob}, + journal = {arXiv preprint arXiv:2103.03874}, + title = {Measuring mathematical problem solving with the math dataset}, + year = {2021}, } -@article{cobbe2021training, - title={Training verifiers to solve math word problems}, - author={Cobbe, 
Karl and Kosaraju, Vineet and Bavarian, Mohammad and Chen, Mark and Jun, Heewoo and Kaiser, Lukasz and Plappert, Matthias and Tworek, Jerry and Hilton, Jacob and Nakano, Reiichiro and others}, - journal={arXiv preprint arXiv:2110.14168}, - year={2021} + +@article{xiao2024rar, + author = {Xiao, Chenghao and Hudson, G Thomas and Moubayed, Noura Al}, + journal = {arXiv preprint arXiv:2404.06347}, + title = {RAR-b: Reasoning as Retrieval Benchmark}, + year = {2024}, } + @article{yu2023metamath, - title={Metamath: Bootstrap your own mathematical questions for large language models}, - author={Yu, Longhui and Jiang, Weisen and Shi, Han and Yu, Jincheng and Liu, Zhengying and Zhang, Yu and Kwok, James T and Li, Zhenguo and Weller, Adrian and Liu, Weiyang}, - journal={arXiv preprint arXiv:2309.12284}, - year={2023} + author = {Yu, Longhui and Jiang, Weisen and Shi, Han and Yu, Jincheng and Liu, Zhengying and Zhang, Yu and Kwok, James T and Li, Zhenguo and Weller, Adrian and Liu, Weiyang}, + journal = {arXiv preprint arXiv:2309.12284}, + title = {Metamath: Bootstrap your own mathematical questions for large language models}, + year = {2023}, } """, prompt={"query": "Retrieve the answer for the following math problem."}, diff --git a/mteb/tasks/Retrieval/eng/SCIDOCSRetrieval.py b/mteb/tasks/Retrieval/eng/SCIDOCSRetrieval.py index 231c695d48..4ab6911e42 100644 --- a/mteb/tasks/Retrieval/eng/SCIDOCSRetrieval.py +++ b/mteb/tasks/Retrieval/eng/SCIDOCSRetrieval.py @@ -30,12 +30,14 @@ class SCIDOCS(AbsTaskRetrieval): annotations_creators=None, dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{specter2020cohan, - title={SPECTER: Document-level Representation Learning using Citation-informed Transformers}, - author={Arman Cohan and Sergey Feldman and Iz Beltagy and Doug Downey and Daniel S. 
Weld}, - booktitle={ACL}, - year={2020} -}""", + bibtex_citation=r""" +@inproceedings{specter2020cohan, + author = {Arman Cohan and Sergey Feldman and Iz Beltagy and Doug Downey and Daniel S. Weld}, + booktitle = {ACL}, + title = {SPECTER: Document-level Representation Learning using Citation-informed Transformers}, + year = {2020}, +} +""", prompt={ "query": "Given a scientific paper title, retrieve paper abstracts that are cited by the given paper" }, diff --git a/mteb/tasks/Retrieval/eng/SciFactRetrieval.py b/mteb/tasks/Retrieval/eng/SciFactRetrieval.py index a44eb052bd..f36f7247cf 100644 --- a/mteb/tasks/Retrieval/eng/SciFactRetrieval.py +++ b/mteb/tasks/Retrieval/eng/SciFactRetrieval.py @@ -27,12 +27,14 @@ class SciFact(AbsTaskRetrieval): annotations_creators=None, dialect=None, sample_creation=None, - bibtex_citation="""@inproceedings{specter2020cohan, - title={SPECTER: Document-level Representation Learning using Citation-informed Transformers}, - author={Arman Cohan and Sergey Feldman and Iz Beltagy and Doug Downey and Daniel S. Weld}, - booktitle={ACL}, - year={2020} -}""", + bibtex_citation=r""" +@inproceedings{specter2020cohan, + author = {Arman Cohan and Sergey Feldman and Iz Beltagy and Doug Downey and Daniel S. 
Weld}, + booktitle = {ACL}, + title = {SPECTER: Document-level Representation Learning using Citation-informed Transformers}, + year = {2020}, +} +""", prompt={ "query": "Given a scientific claim, retrieve documents that support or refute the claim" }, diff --git a/mteb/tasks/Retrieval/eng/SiqaRetrieval.py b/mteb/tasks/Retrieval/eng/SiqaRetrieval.py index b8c42f7675..922e750a6a 100644 --- a/mteb/tasks/Retrieval/eng/SiqaRetrieval.py +++ b/mteb/tasks/Retrieval/eng/SiqaRetrieval.py @@ -28,17 +28,19 @@ class SIQA(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@article{xiao2024rar, - title={RAR-b: Reasoning as Retrieval Benchmark}, - author={Xiao, Chenghao and Hudson, G Thomas and Moubayed, Noura Al}, - journal={arXiv preprint arXiv:2404.06347}, - year={2024} -} + bibtex_citation=r""" @article{sap2019socialiqa, - title={Socialiqa: Commonsense reasoning about social interactions}, - author={Sap, Maarten and Rashkin, Hannah and Chen, Derek and LeBras, Ronan and Choi, Yejin}, - journal={arXiv preprint arXiv:1904.09728}, - year={2019} + author = {Sap, Maarten and Rashkin, Hannah and Chen, Derek and LeBras, Ronan and Choi, Yejin}, + journal = {arXiv preprint arXiv:1904.09728}, + title = {Socialiqa: Commonsense reasoning about social interactions}, + year = {2019}, +} + +@article{xiao2024rar, + author = {Xiao, Chenghao and Hudson, G Thomas and Moubayed, Noura Al}, + journal = {arXiv preprint arXiv:2404.06347}, + title = {RAR-b: Reasoning as Retrieval Benchmark}, + year = {2024}, } """, prompt={ diff --git a/mteb/tasks/Retrieval/eng/SpartQARetrieval.py b/mteb/tasks/Retrieval/eng/SpartQARetrieval.py index c0262f01cd..f4fb275e37 100644 --- a/mteb/tasks/Retrieval/eng/SpartQARetrieval.py +++ b/mteb/tasks/Retrieval/eng/SpartQARetrieval.py @@ -28,17 +28,19 @@ class SpartQA(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@article{xiao2024rar, - title={RAR-b: 
Reasoning as Retrieval Benchmark}, - author={Xiao, Chenghao and Hudson, G Thomas and Moubayed, Noura Al}, - journal={arXiv preprint arXiv:2404.06347}, - year={2024} -} + bibtex_citation=r""" @article{mirzaee2021spartqa, - title={Spartqa:: A textual question answering benchmark for spatial reasoning}, - author={Mirzaee, Roshanak and Faghihi, Hossein Rajaby and Ning, Qiang and Kordjmashidi, Parisa}, - journal={arXiv preprint arXiv:2104.05832}, - year={2021} + author = {Mirzaee, Roshanak and Faghihi, Hossein Rajaby and Ning, Qiang and Kordjmashidi, Parisa}, + journal = {arXiv preprint arXiv:2104.05832}, + title = {Spartqa:: A textual question answering benchmark for spatial reasoning}, + year = {2021}, +} + +@article{xiao2024rar, + author = {Xiao, Chenghao and Hudson, G Thomas and Moubayed, Noura Al}, + journal = {arXiv preprint arXiv:2404.06347}, + title = {RAR-b: Reasoning as Retrieval Benchmark}, + year = {2024}, } """, prompt={ diff --git a/mteb/tasks/Retrieval/eng/TRECCOVIDRetrieval.py b/mteb/tasks/Retrieval/eng/TRECCOVIDRetrieval.py index f48adcb255..3b3764a4fd 100644 --- a/mteb/tasks/Retrieval/eng/TRECCOVIDRetrieval.py +++ b/mteb/tasks/Retrieval/eng/TRECCOVIDRetrieval.py @@ -27,14 +27,16 @@ class TRECCOVID(AbsTaskRetrieval): annotations_creators=None, dialect=None, sample_creation=None, - bibtex_citation="""@misc{roberts2021searching, - title={Searching for Scientific Evidence in a Pandemic: An Overview of TREC-COVID}, - author={Kirk Roberts and Tasmeer Alam and Steven Bedrick and Dina Demner-Fushman and Kyle Lo and Ian Soboroff and Ellen Voorhees and Lucy Lu Wang and William R Hersh}, - year={2021}, - eprint={2104.09632}, - archivePrefix={arXiv}, - primaryClass={cs.IR} -}""", + bibtex_citation=r""" +@misc{roberts2021searching, + archiveprefix = {arXiv}, + author = {Kirk Roberts and Tasmeer Alam and Steven Bedrick and Dina Demner-Fushman and Kyle Lo and Ian Soboroff and Ellen Voorhees and Lucy Lu Wang and William R Hersh}, + eprint = {2104.09632}, + 
primaryclass = {cs.IR}, + title = {Searching for Scientific Evidence in a Pandemic: An Overview of TREC-COVID}, + year = {2021}, +} +""", prompt={ "query": "Given a query on COVID-19, retrieve documents that answer the query" }, diff --git a/mteb/tasks/Retrieval/eng/TempReasonL1Retrieval.py b/mteb/tasks/Retrieval/eng/TempReasonL1Retrieval.py index 392dd1c1b7..4a939593b3 100644 --- a/mteb/tasks/Retrieval/eng/TempReasonL1Retrieval.py +++ b/mteb/tasks/Retrieval/eng/TempReasonL1Retrieval.py @@ -28,17 +28,19 @@ class TempReasonL1(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@article{xiao2024rar, - title={RAR-b: Reasoning as Retrieval Benchmark}, - author={Xiao, Chenghao and Hudson, G Thomas and Moubayed, Noura Al}, - journal={arXiv preprint arXiv:2404.06347}, - year={2024} -} + bibtex_citation=r""" @article{tan2023towards, - title={Towards benchmarking and improving the temporal reasoning capability of large language models}, - author={Tan, Qingyu and Ng, Hwee Tou and Bing, Lidong}, - journal={arXiv preprint arXiv:2306.08952}, - year={2023} + author = {Tan, Qingyu and Ng, Hwee Tou and Bing, Lidong}, + journal = {arXiv preprint arXiv:2306.08952}, + title = {Towards benchmarking and improving the temporal reasoning capability of large language models}, + year = {2023}, +} + +@article{xiao2024rar, + author = {Xiao, Chenghao and Hudson, G Thomas and Moubayed, Noura Al}, + journal = {arXiv preprint arXiv:2404.06347}, + title = {RAR-b: Reasoning as Retrieval Benchmark}, + year = {2024}, } """, prompt={ diff --git a/mteb/tasks/Retrieval/eng/TempReasonL2ContextRetrieval.py b/mteb/tasks/Retrieval/eng/TempReasonL2ContextRetrieval.py index 924c1621f5..25e1fa96da 100644 --- a/mteb/tasks/Retrieval/eng/TempReasonL2ContextRetrieval.py +++ b/mteb/tasks/Retrieval/eng/TempReasonL2ContextRetrieval.py @@ -28,17 +28,19 @@ class TempReasonL2Context(AbsTaskRetrieval): annotations_creators="derived", dialect=[], 
sample_creation="found", - bibtex_citation="""@article{xiao2024rar, - title={RAR-b: Reasoning as Retrieval Benchmark}, - author={Xiao, Chenghao and Hudson, G Thomas and Moubayed, Noura Al}, - journal={arXiv preprint arXiv:2404.06347}, - year={2024} -} + bibtex_citation=r""" @article{tan2023towards, - title={Towards benchmarking and improving the temporal reasoning capability of large language models}, - author={Tan, Qingyu and Ng, Hwee Tou and Bing, Lidong}, - journal={arXiv preprint arXiv:2306.08952}, - year={2023} + author = {Tan, Qingyu and Ng, Hwee Tou and Bing, Lidong}, + journal = {arXiv preprint arXiv:2306.08952}, + title = {Towards benchmarking and improving the temporal reasoning capability of large language models}, + year = {2023}, +} + +@article{xiao2024rar, + author = {Xiao, Chenghao and Hudson, G Thomas and Moubayed, Noura Al}, + journal = {arXiv preprint arXiv:2404.06347}, + title = {RAR-b: Reasoning as Retrieval Benchmark}, + year = {2024}, } """, prompt={ diff --git a/mteb/tasks/Retrieval/eng/TempReasonL2FactRetrieval.py b/mteb/tasks/Retrieval/eng/TempReasonL2FactRetrieval.py index 4e1fc53a29..4f280a9d65 100644 --- a/mteb/tasks/Retrieval/eng/TempReasonL2FactRetrieval.py +++ b/mteb/tasks/Retrieval/eng/TempReasonL2FactRetrieval.py @@ -28,17 +28,19 @@ class TempReasonL2Fact(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@article{xiao2024rar, - title={RAR-b: Reasoning as Retrieval Benchmark}, - author={Xiao, Chenghao and Hudson, G Thomas and Moubayed, Noura Al}, - journal={arXiv preprint arXiv:2404.06347}, - year={2024} -} + bibtex_citation=r""" @article{tan2023towards, - title={Towards benchmarking and improving the temporal reasoning capability of large language models}, - author={Tan, Qingyu and Ng, Hwee Tou and Bing, Lidong}, - journal={arXiv preprint arXiv:2306.08952}, - year={2023} + author = {Tan, Qingyu and Ng, Hwee Tou and Bing, Lidong}, + journal = {arXiv preprint 
arXiv:2306.08952}, + title = {Towards benchmarking and improving the temporal reasoning capability of large language models}, + year = {2023}, +} + +@article{xiao2024rar, + author = {Xiao, Chenghao and Hudson, G Thomas and Moubayed, Noura Al}, + journal = {arXiv preprint arXiv:2404.06347}, + title = {RAR-b: Reasoning as Retrieval Benchmark}, + year = {2024}, } """, prompt={ diff --git a/mteb/tasks/Retrieval/eng/TempReasonL2PureRetrieval.py b/mteb/tasks/Retrieval/eng/TempReasonL2PureRetrieval.py index b69989af03..4e0899004d 100644 --- a/mteb/tasks/Retrieval/eng/TempReasonL2PureRetrieval.py +++ b/mteb/tasks/Retrieval/eng/TempReasonL2PureRetrieval.py @@ -28,17 +28,19 @@ class TempReasonL2Pure(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@article{xiao2024rar, - title={RAR-b: Reasoning as Retrieval Benchmark}, - author={Xiao, Chenghao and Hudson, G Thomas and Moubayed, Noura Al}, - journal={arXiv preprint arXiv:2404.06347}, - year={2024} -} + bibtex_citation=r""" @article{tan2023towards, - title={Towards benchmarking and improving the temporal reasoning capability of large language models}, - author={Tan, Qingyu and Ng, Hwee Tou and Bing, Lidong}, - journal={arXiv preprint arXiv:2306.08952}, - year={2023} + author = {Tan, Qingyu and Ng, Hwee Tou and Bing, Lidong}, + journal = {arXiv preprint arXiv:2306.08952}, + title = {Towards benchmarking and improving the temporal reasoning capability of large language models}, + year = {2023}, +} + +@article{xiao2024rar, + author = {Xiao, Chenghao and Hudson, G Thomas and Moubayed, Noura Al}, + journal = {arXiv preprint arXiv:2404.06347}, + title = {RAR-b: Reasoning as Retrieval Benchmark}, + year = {2024}, } """, prompt={"query": "Given the following question, retrieve the correct answer."}, diff --git a/mteb/tasks/Retrieval/eng/TempReasonL3ContextRetrieval.py b/mteb/tasks/Retrieval/eng/TempReasonL3ContextRetrieval.py index 65f70ab13a..26e7b44404 100644 --- 
a/mteb/tasks/Retrieval/eng/TempReasonL3ContextRetrieval.py +++ b/mteb/tasks/Retrieval/eng/TempReasonL3ContextRetrieval.py @@ -28,17 +28,19 @@ class TempReasonL3Context(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@article{xiao2024rar, - title={RAR-b: Reasoning as Retrieval Benchmark}, - author={Xiao, Chenghao and Hudson, G Thomas and Moubayed, Noura Al}, - journal={arXiv preprint arXiv:2404.06347}, - year={2024} -} + bibtex_citation=r""" @article{tan2023towards, - title={Towards benchmarking and improving the temporal reasoning capability of large language models}, - author={Tan, Qingyu and Ng, Hwee Tou and Bing, Lidong}, - journal={arXiv preprint arXiv:2306.08952}, - year={2023} + author = {Tan, Qingyu and Ng, Hwee Tou and Bing, Lidong}, + journal = {arXiv preprint arXiv:2306.08952}, + title = {Towards benchmarking and improving the temporal reasoning capability of large language models}, + year = {2023}, +} + +@article{xiao2024rar, + author = {Xiao, Chenghao and Hudson, G Thomas and Moubayed, Noura Al}, + journal = {arXiv preprint arXiv:2404.06347}, + title = {RAR-b: Reasoning as Retrieval Benchmark}, + year = {2024}, } """, prompt={ diff --git a/mteb/tasks/Retrieval/eng/TempReasonL3FactRetrieval.py b/mteb/tasks/Retrieval/eng/TempReasonL3FactRetrieval.py index 65db6a70ba..532469d67c 100644 --- a/mteb/tasks/Retrieval/eng/TempReasonL3FactRetrieval.py +++ b/mteb/tasks/Retrieval/eng/TempReasonL3FactRetrieval.py @@ -28,17 +28,19 @@ class TempReasonL3Fact(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@article{xiao2024rar, - title={RAR-b: Reasoning as Retrieval Benchmark}, - author={Xiao, Chenghao and Hudson, G Thomas and Moubayed, Noura Al}, - journal={arXiv preprint arXiv:2404.06347}, - year={2024} -} + bibtex_citation=r""" @article{tan2023towards, - title={Towards benchmarking and improving the temporal reasoning capability of large language 
models}, - author={Tan, Qingyu and Ng, Hwee Tou and Bing, Lidong}, - journal={arXiv preprint arXiv:2306.08952}, - year={2023} + author = {Tan, Qingyu and Ng, Hwee Tou and Bing, Lidong}, + journal = {arXiv preprint arXiv:2306.08952}, + title = {Towards benchmarking and improving the temporal reasoning capability of large language models}, + year = {2023}, +} + +@article{xiao2024rar, + author = {Xiao, Chenghao and Hudson, G Thomas and Moubayed, Noura Al}, + journal = {arXiv preprint arXiv:2404.06347}, + title = {RAR-b: Reasoning as Retrieval Benchmark}, + year = {2024}, } """, prompt={ diff --git a/mteb/tasks/Retrieval/eng/TempReasonL3PureRetrieval.py b/mteb/tasks/Retrieval/eng/TempReasonL3PureRetrieval.py index 32738f7180..12340dd35b 100644 --- a/mteb/tasks/Retrieval/eng/TempReasonL3PureRetrieval.py +++ b/mteb/tasks/Retrieval/eng/TempReasonL3PureRetrieval.py @@ -28,17 +28,19 @@ class TempReasonL3Pure(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@article{xiao2024rar, - title={RAR-b: Reasoning as Retrieval Benchmark}, - author={Xiao, Chenghao and Hudson, G Thomas and Moubayed, Noura Al}, - journal={arXiv preprint arXiv:2404.06347}, - year={2024} -} + bibtex_citation=r""" @article{tan2023towards, - title={Towards benchmarking and improving the temporal reasoning capability of large language models}, - author={Tan, Qingyu and Ng, Hwee Tou and Bing, Lidong}, - journal={arXiv preprint arXiv:2306.08952}, - year={2023} + author = {Tan, Qingyu and Ng, Hwee Tou and Bing, Lidong}, + journal = {arXiv preprint arXiv:2306.08952}, + title = {Towards benchmarking and improving the temporal reasoning capability of large language models}, + year = {2023}, +} + +@article{xiao2024rar, + author = {Xiao, Chenghao and Hudson, G Thomas and Moubayed, Noura Al}, + journal = {arXiv preprint arXiv:2404.06347}, + title = {RAR-b: Reasoning as Retrieval Benchmark}, + year = {2024}, } """, prompt={"query": "Given the following 
question, retrieve the correct answer."}, diff --git a/mteb/tasks/Retrieval/eng/TopiOCQARetrieval.py b/mteb/tasks/Retrieval/eng/TopiOCQARetrieval.py index 5814fbc648..dddb0e0c89 100644 --- a/mteb/tasks/Retrieval/eng/TopiOCQARetrieval.py +++ b/mteb/tasks/Retrieval/eng/TopiOCQARetrieval.py @@ -40,16 +40,16 @@ class TopiOCQARetrieval(AbsTaskRetrieval): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{adlakha2022topiocqa, - title={TopiOCQA: Open-domain Conversational Question Answering with Topic Switching}, - author={Vaibhav Adlakha and Shehzaad Dhuliawala and Kaheer Suleman and Harm de Vries and Siva Reddy}, - year={2022}, - eprint={2110.00768}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - } - """, + bibtex_citation=r""" +@misc{adlakha2022topiocqa, + archiveprefix = {arXiv}, + author = {Vaibhav Adlakha and Shehzaad Dhuliawala and Kaheer Suleman and Harm de Vries and Siva Reddy}, + eprint = {2110.00768}, + primaryclass = {cs.CL}, + title = {TopiOCQA: Open-domain Conversational Question Answering with Topic Switching}, + year = {2022}, +} +""", ) # TODO: Will be removed if curated and added to mteb HF @@ -123,15 +123,15 @@ class TopiOCQARetrievalHardNegatives(AbsTaskRetrieval): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @misc{adlakha2022topiocqa, - title={TopiOCQA: Open-domain Conversational Question Answering with Topic Switching}, - author={Vaibhav Adlakha and Shehzaad Dhuliawala and Kaheer Suleman and Harm de Vries and Siva Reddy}, - year={2022}, - eprint={2110.00768}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - } - """, + bibtex_citation=r""" +@misc{adlakha2022topiocqa, + archiveprefix = {arXiv}, + author = {Vaibhav Adlakha and Shehzaad Dhuliawala and Kaheer Suleman and Harm de Vries and Siva Reddy}, + eprint = {2110.00768}, + primaryclass = {cs.CL}, + title = {TopiOCQA: Open-domain Conversational Question Answering with Topic Switching}, + 
year = {2022}, +} +""", adapted_from=["TopiOCQA"], ) diff --git a/mteb/tasks/Retrieval/eng/Touche2020Retrieval.py b/mteb/tasks/Retrieval/eng/Touche2020Retrieval.py index e97452da60..dd196b7f3e 100644 --- a/mteb/tasks/Retrieval/eng/Touche2020Retrieval.py +++ b/mteb/tasks/Retrieval/eng/Touche2020Retrieval.py @@ -28,23 +28,25 @@ class Touche2020(AbsTaskRetrieval): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@dataset{potthast_2022_6862281, - author = {Potthast, Martin and - Gienapp, Lukas and - Wachsmuth, Henning and - Hagen, Matthias and - Fröbe, Maik and - Bondarenko, Alexander and - Ajjour, Yamen and - Stein, Benno}, - title = {{Touché20-Argument-Retrieval-for-Controversial- - Questions}}, - month = jul, - year = 2022, - publisher = {Zenodo}, - doi = {10.5281/zenodo.6862281}, - url = {https://doi.org/10.5281/zenodo.6862281} -}""", + bibtex_citation=r""" +@dataset{potthast_2022_6862281, + author = {Potthast, Martin and +Gienapp, Lukas and +Wachsmuth, Henning and +Hagen, Matthias and +Fröbe, Maik and +Bondarenko, Alexander and +Ajjour, Yamen and +Stein, Benno}, + doi = {10.5281/zenodo.6862281}, + month = jul, + publisher = {Zenodo}, + title = {{Touché20-Argument-Retrieval-for-Controversial- +Questions}}, + url = {https://doi.org/10.5281/zenodo.6862281}, + year = {2022}, +} +""", prompt={ "query": "Given a question, retrieve detailed and persuasive arguments that answer the question" }, @@ -74,12 +76,14 @@ class Touche2020v3Retrieval(AbsTaskRetrieval): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@INPROCEEDINGS{Thakur_etal_SIGIR2024, - author = "Nandan Thakur and Luiz Bonifacio and Maik {Fr\"{o}be} and Alexander Bondarenko and Ehsan Kamalloo and Martin Potthast and Matthias Hagen and Jimmy Lin", - title = "Systematic Evaluation of Neural Retrieval Models on the {Touch\'{e}} 2020 Argument Retrieval Subset of {BEIR}", - booktitle = "Proceedings of the 47th 
International ACM SIGIR Conference on Research and Development in Information Retrieval", - year = 2024, - address_ = "Washington, D.C." -}""", + bibtex_citation=r""" +@inproceedings{Thakur_etal_SIGIR2024, + address_ = {Washington, D.C.}, + author = {Nandan Thakur and Luiz Bonifacio and Maik {Fr\"{o}be} and Alexander Bondarenko and Ehsan Kamalloo and Martin Potthast and Matthias Hagen and Jimmy Lin}, + booktitle = {Proceedings of the 47th International ACM SIGIR Conference on Research and Development in Information Retrieval}, + title = {Systematic Evaluation of Neural Retrieval Models on the {Touch\'{e}} 2020 Argument Retrieval Subset of {BEIR}}, + year = {2024}, +} +""", adapted_from=["Touche2020"], ) diff --git a/mteb/tasks/Retrieval/eng/WinoGrandeRetrieval.py b/mteb/tasks/Retrieval/eng/WinoGrandeRetrieval.py index 01b5f2d1cc..e308f2afd5 100644 --- a/mteb/tasks/Retrieval/eng/WinoGrandeRetrieval.py +++ b/mteb/tasks/Retrieval/eng/WinoGrandeRetrieval.py @@ -28,21 +28,23 @@ class WinoGrande(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@article{xiao2024rar, - title={RAR-b: Reasoning as Retrieval Benchmark}, - author={Xiao, Chenghao and Hudson, G Thomas and Moubayed, Noura Al}, - journal={arXiv preprint arXiv:2404.06347}, - year={2024} -} + bibtex_citation=r""" @article{sakaguchi2021winogrande, - title={Winogrande: An adversarial winograd schema challenge at scale}, - author={Sakaguchi, Keisuke and Bras, Ronan Le and Bhagavatula, Chandra and Choi, Yejin}, - journal={Communications of the ACM}, - volume={64}, - number={9}, - pages={99--106}, - year={2021}, - publisher={ACM New York, NY, USA} + author = {Sakaguchi, Keisuke and Bras, Ronan Le and Bhagavatula, Chandra and Choi, Yejin}, + journal = {Communications of the ACM}, + number = {9}, + pages = {99--106}, + publisher = {ACM New York, NY, USA}, + title = {Winogrande: An adversarial winograd schema challenge at scale}, + volume = {64}, + year = {2021}, 
+} + +@article{xiao2024rar, + author = {Xiao, Chenghao and Hudson, G Thomas and Moubayed, Noura Al}, + journal = {arXiv preprint arXiv:2404.06347}, + title = {RAR-b: Reasoning as Retrieval Benchmark}, + year = {2024}, } """, prompt={ diff --git a/mteb/tasks/Retrieval/est/estqa.py b/mteb/tasks/Retrieval/est/estqa.py index b8eebb61d9..71dbaa7aa4 100644 --- a/mteb/tasks/Retrieval/est/estqa.py +++ b/mteb/tasks/Retrieval/est/estqa.py @@ -32,12 +32,12 @@ class EstQA(AbsTaskRetrieval): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" + bibtex_citation=r""" @mastersthesis{mastersthesis, - author = {Anu Käver}, - title = {Extractive Question Answering for Estonian Language}, - school = {Tallinn University of Technology (TalTech)}, - year = 2021 + author = {Anu Käver}, + school = {Tallinn University of Technology (TalTech)}, + title = {Extractive Question Answering for Estonian Language}, + year = {2021}, } """, ) diff --git a/mteb/tasks/Retrieval/fra/AlloprofRetrieval.py b/mteb/tasks/Retrieval/fra/AlloprofRetrieval.py index ada02b511b..ccc8e7c581 100644 --- a/mteb/tasks/Retrieval/fra/AlloprofRetrieval.py +++ b/mteb/tasks/Retrieval/fra/AlloprofRetrieval.py @@ -30,16 +30,18 @@ class AlloprofRetrieval(AbsTaskRetrieval): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@misc{lef23, - doi = {10.48550/ARXIV.2302.07738}, - url = {https://arxiv.org/abs/2302.07738}, + bibtex_citation=r""" +@misc{lef23, author = {Lefebvre-Brossard, Antoine and Gazaille, Stephane and Desmarais, Michel C.}, + copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International}, + doi = {10.48550/ARXIV.2302.07738}, keywords = {Computation and Language (cs.CL), Information Retrieval (cs.IR), Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences}, - title = {Alloprof: a new French question-answer education dataset and its use in an 
information retrieval case study}, publisher = {arXiv}, + title = {Alloprof: a new French question-answer education dataset and its use in an information retrieval case study}, + url = {https://arxiv.org/abs/2302.07738}, year = {2023}, - copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International} -}""", +} +""", ) def load_data(self, **kwargs): diff --git a/mteb/tasks/Retrieval/fra/BSARDRetrieval.py b/mteb/tasks/Retrieval/fra/BSARDRetrieval.py index 93509c51fc..9052a40a30 100644 --- a/mteb/tasks/Retrieval/fra/BSARDRetrieval.py +++ b/mteb/tasks/Retrieval/fra/BSARDRetrieval.py @@ -32,18 +32,20 @@ class BSARDRetrieval(AbsTaskRetrieval): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{louis2022statutory, - title = {A Statutory Article Retrieval Dataset in French}, + bibtex_citation=r""" +@inproceedings{louis2022statutory, + address = {Dublin, Ireland}, author = {Louis, Antoine and Spanakis, Gerasimos}, booktitle = {Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics}, + doi = {10.18653/v1/2022.acl-long.468}, month = may, - year = {2022}, - address = {Dublin, Ireland}, + pages = {6789–6803}, publisher = {Association for Computational Linguistics}, + title = {A Statutory Article Retrieval Dataset in French}, url = {https://aclanthology.org/2022.acl-long.468/}, - doi = {10.18653/v1/2022.acl-long.468}, - pages = {6789–6803}, -}""", + year = {2022}, +} +""", ) def load_data(self, **kwargs): diff --git a/mteb/tasks/Retrieval/fra/FQuADRetrieval.py b/mteb/tasks/Retrieval/fra/FQuADRetrieval.py index 20a54b8232..c54a760599 100644 --- a/mteb/tasks/Retrieval/fra/FQuADRetrieval.py +++ b/mteb/tasks/Retrieval/fra/FQuADRetrieval.py @@ -31,25 +31,27 @@ class FQuADRetrieval(AbsTaskRetrieval): annotations_creators="human-annotated", dialect=[], sample_creation="created", - bibtex_citation="""@inproceedings{dhoffschmidt-etal-2020-fquad, - title = "{FQ}u{AD}: 
{F}rench Question Answering Dataset", - author = "d{'}Hoffschmidt, Martin and - Belblidia, Wacim and - Heinrich, Quentin and - Brendl{\'e}, Tom and - Vidal, Maxime", - editor = "Cohn, Trevor and - He, Yulan and - Liu, Yang", - booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2020", - month = nov, - year = "2020", - address = "Online", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/2020.findings-emnlp.107", - doi = "10.18653/v1/2020.findings-emnlp.107", - pages = "1193--1208", -}""", + bibtex_citation=r""" +@inproceedings{dhoffschmidt-etal-2020-fquad, + address = {Online}, + author = {d{'}Hoffschmidt, Martin and +Belblidia, Wacim and +Heinrich, Quentin and +Brendl{\'e}, Tom and +Vidal, Maxime}, + booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2020}, + doi = {10.18653/v1/2020.findings-emnlp.107}, + editor = {Cohn, Trevor and +He, Yulan and +Liu, Yang}, + month = nov, + pages = {1193--1208}, + publisher = {Association for Computational Linguistics}, + title = {{FQ}u{AD}: {F}rench Question Answering Dataset}, + url = {https://aclanthology.org/2020.findings-emnlp.107}, + year = {2020}, +} +""", ) def load_data(self, **kwargs): diff --git a/mteb/tasks/Retrieval/fra/SyntecRetrieval.py b/mteb/tasks/Retrieval/fra/SyntecRetrieval.py index fb17776268..5a161c5b15 100644 --- a/mteb/tasks/Retrieval/fra/SyntecRetrieval.py +++ b/mteb/tasks/Retrieval/fra/SyntecRetrieval.py @@ -31,14 +31,16 @@ class SyntecRetrieval(AbsTaskRetrieval): annotations_creators="human-annotated", dialect=[], sample_creation="created", - bibtex_citation="""@misc{ciancone2024extending, - title={Extending the Massive Text Embedding Benchmark to French}, - author={Mathieu Ciancone and Imene Kerboua and Marion Schaeffer and Wissam Siblini}, - year={2024}, - eprint={2405.20468}, - archivePrefix={arXiv}, - primaryClass={cs.CL} -}""", + bibtex_citation=r""" +@misc{ciancone2024extending, + archiveprefix = 
{arXiv}, + author = {Mathieu Ciancone and Imene Kerboua and Marion Schaeffer and Wissam Siblini}, + eprint = {2405.20468}, + primaryclass = {cs.CL}, + title = {Extending the Massive Text Embedding Benchmark to French}, + year = {2024}, +} +""", ) def load_data(self, **kwargs): diff --git a/mteb/tasks/Retrieval/hun/HunSum2.py b/mteb/tasks/Retrieval/hun/HunSum2.py index 2a82834450..a8399d554b 100644 --- a/mteb/tasks/Retrieval/hun/HunSum2.py +++ b/mteb/tasks/Retrieval/hun/HunSum2.py @@ -34,14 +34,14 @@ class HunSum2AbstractiveRetrieval(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation=""" + bibtex_citation=r""" @misc{barta2024news, - title={From News to Summaries: Building a Hungarian Corpus for Extractive and Abstractive Summarization}, - author={Botond Barta and Dorina Lakatos and Attila Nagy and Milán Konor Nyist and Judit Ács}, - year={2024}, - eprint={2404.03555}, - archivePrefix={arXiv}, - primaryClass={cs.CL} + archiveprefix = {arXiv}, + author = {Botond Barta and Dorina Lakatos and Attila Nagy and Milán Konor Nyist and Judit Ács}, + eprint = {2404.03555}, + primaryclass = {cs.CL}, + title = {From News to Summaries: Building a Hungarian Corpus for Extractive and Abstractive Summarization}, + year = {2024}, } """, ) diff --git a/mteb/tasks/Retrieval/jpn/JaQuADRetrieval.py b/mteb/tasks/Retrieval/jpn/JaQuADRetrieval.py index 07fb165632..3f16441fee 100644 --- a/mteb/tasks/Retrieval/jpn/JaQuADRetrieval.py +++ b/mteb/tasks/Retrieval/jpn/JaQuADRetrieval.py @@ -29,14 +29,16 @@ class JaQuADRetrieval(AbsTaskRetrieval): annotations_creators="human-annotated", dialect=None, sample_creation="found", - bibtex_citation="""@misc{so2022jaquad, - title={{JaQuAD: Japanese Question Answering Dataset for Machine Reading Comprehension}}, - author={ByungHoon So and Kyuhong Byun and Kyungwon Kang and Seongjin Cho}, - year={2022}, - eprint={2202.01764}, - archivePrefix={arXiv}, - primaryClass={cs.CL} -}""", + bibtex_citation=r""" 
+@misc{so2022jaquad, + archiveprefix = {arXiv}, + author = {ByungHoon So and Kyuhong Byun and Kyungwon Kang and Seongjin Cho}, + eprint = {2202.01764}, + primaryclass = {cs.CL}, + title = {{JaQuAD: Japanese Question Answering Dataset for Machine Reading Comprehension}}, + year = {2022}, +} +""", ) def load_data(self, **kwargs): diff --git a/mteb/tasks/Retrieval/jpn/JaqketRetrieval.py b/mteb/tasks/Retrieval/jpn/JaqketRetrieval.py index bff152e239..19c42cb474 100644 --- a/mteb/tasks/Retrieval/jpn/JaqketRetrieval.py +++ b/mteb/tasks/Retrieval/jpn/JaqketRetrieval.py @@ -26,12 +26,14 @@ class JaqketRetrieval(AbsTaskRetrieval): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@InProceedings{Kurihara_nlp2020, -author = "鈴木正敏 and 鈴木潤 and 松田耕史 and ⻄田京介 and 井之上直也", -title = "JAQKET: クイズを題材にした日本語 QA データセットの構築", -booktitle = "言語処理学会第26回年次大会", -year = "2020", -url = "https://www.anlp.jp/proceedings/annual_meeting/2020/pdf_dir/P2-24.pdf" -note= "in Japanese" -}""", + bibtex_citation=r""" +@inproceedings{Kurihara_nlp2020, + author = {鈴木正敏 and 鈴木潤 and 松田耕史 and ⻄田京介 and 井之上直也}, + booktitle = {言語処理学会第26回年次大会}, + note = {in Japanese}, + title = {JAQKET: クイズを題材にした日本語 QA データセットの構築}, + url = {https://www.anlp.jp/proceedings/annual_meeting/2020/pdf_dir/P2-24.pdf}, + year = {2020}, +} +""", ) diff --git a/mteb/tasks/Retrieval/kor/AutoRAGRetrieval.py b/mteb/tasks/Retrieval/kor/AutoRAGRetrieval.py index 10102d73ac..18ae3e45c3 100644 --- a/mteb/tasks/Retrieval/kor/AutoRAGRetrieval.py +++ b/mteb/tasks/Retrieval/kor/AutoRAGRetrieval.py @@ -28,13 +28,15 @@ class AutoRAGRetrieval(AbsTaskRetrieval): annotations_creators="human-annotated", dialect=[], sample_creation="created", - bibtex_citation="""@misc{kim2024autoragautomatedframeworkoptimization, - title={AutoRAG: Automated Framework for optimization of Retrieval Augmented Generation Pipeline}, - author={Dongkyu Kim and Byoungwook Kim and Donggeon Han and Matouš Eibich}, - year={2024}, - 
eprint={2410.20878}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2410.20878}, -}""", + bibtex_citation=r""" +@misc{kim2024autoragautomatedframeworkoptimization, + archiveprefix = {arXiv}, + author = {Dongkyu Kim and Byoungwook Kim and Donggeon Han and Matouš Eibich}, + eprint = {2410.20878}, + primaryclass = {cs.CL}, + title = {AutoRAG: Automated Framework for optimization of Retrieval Augmented Generation Pipeline}, + url = {https://arxiv.org/abs/2410.20878}, + year = {2024}, +} +""", ) diff --git a/mteb/tasks/Retrieval/kor/KoStrategyQA.py b/mteb/tasks/Retrieval/kor/KoStrategyQA.py index ce64da5432..4c5b7ec906 100644 --- a/mteb/tasks/Retrieval/kor/KoStrategyQA.py +++ b/mteb/tasks/Retrieval/kor/KoStrategyQA.py @@ -27,10 +27,12 @@ class KoStrategyQA(AbsTaskRetrieval): annotations_creators=None, dialect=None, sample_creation=None, - bibtex_citation="""@article{geva2021strategyqa, - title = {{Did Aristotle Use a Laptop? A Question Answering Benchmark with Implicit Reasoning Strategies}}, + bibtex_citation=r""" +@article{geva2021strategyqa, author = {Geva, Mor and Khashabi, Daniel and Segal, Elad and Khot, Tushar and Roth, Dan and Berant, Jonathan}, journal = {Transactions of the Association for Computational Linguistics (TACL)}, + title = {{Did Aristotle Use a Laptop? 
A Question Answering Benchmark with Implicit Reasoning Strategies}}, year = {2021}, -}""", +} +""", ) diff --git a/mteb/tasks/Retrieval/multilingual/BelebeleRetrieval.py b/mteb/tasks/Retrieval/multilingual/BelebeleRetrieval.py index 8e134751b0..9284c4cc45 100644 --- a/mteb/tasks/Retrieval/multilingual/BelebeleRetrieval.py +++ b/mteb/tasks/Retrieval/multilingual/BelebeleRetrieval.py @@ -202,12 +202,14 @@ class BelebeleRetrieval(MultilingualTask, AbsTaskRetrieval): task_subtypes=["Question answering"], annotations_creators="expert-annotated", dialect=[], - bibtex_citation="""@article{bandarkar2023belebele, - title={The Belebele Benchmark: a Parallel Reading Comprehension Dataset in 122 Language Variants}, - author={Lucas Bandarkar and Davis Liang and Benjamin Muller and Mikel Artetxe and Satya Narayan Shukla and Donald Husa and Naman Goyal and Abhinandan Krishnan and Luke Zettlemoyer and Madian Khabsa}, - year={2023}, - journal={arXiv preprint arXiv:2308.16884} -}""", + bibtex_citation=r""" +@article{bandarkar2023belebele, + author = {Lucas Bandarkar and Davis Liang and Benjamin Muller and Mikel Artetxe and Satya Narayan Shukla and Donald Husa and Naman Goyal and Abhinandan Krishnan and Luke Zettlemoyer and Madian Khabsa}, + journal = {arXiv preprint arXiv:2308.16884}, + title = {The Belebele Benchmark: a Parallel Reading Comprehension Dataset in 122 Language Variants}, + year = {2023}, +} +""", ) def load_data(self, **kwargs) -> None: diff --git a/mteb/tasks/Retrieval/multilingual/CrossLingualSemanticDiscriminationWMT19.py b/mteb/tasks/Retrieval/multilingual/CrossLingualSemanticDiscriminationWMT19.py index 4ca7c5e495..6528faba3e 100644 --- a/mteb/tasks/Retrieval/multilingual/CrossLingualSemanticDiscriminationWMT19.py +++ b/mteb/tasks/Retrieval/multilingual/CrossLingualSemanticDiscriminationWMT19.py @@ -52,7 +52,7 @@ class CrossLingualSemanticDiscriminationWMT19(AbsTaskRetrieval, MultilingualTask annotations_creators="derived", dialect=[], 
sample_creation="LM-generated and verified", - bibtex_citation="preprint_coming", + bibtex_citation="", # preprint_coming ) def __init__(self, **kwargs): diff --git a/mteb/tasks/Retrieval/multilingual/CrossLingualSemanticDiscriminationWMT21.py b/mteb/tasks/Retrieval/multilingual/CrossLingualSemanticDiscriminationWMT21.py index f5c0262308..c569bab909 100644 --- a/mteb/tasks/Retrieval/multilingual/CrossLingualSemanticDiscriminationWMT21.py +++ b/mteb/tasks/Retrieval/multilingual/CrossLingualSemanticDiscriminationWMT21.py @@ -52,7 +52,7 @@ class CrossLingualSemanticDiscriminationWMT21(AbsTaskRetrieval, MultilingualTask annotations_creators="derived", dialect=[], sample_creation="LM-generated and verified", - bibtex_citation="preprint_coming", + bibtex_citation="", # preprint_coming ) def __init__(self, **kwargs): diff --git a/mteb/tasks/Retrieval/multilingual/IndicQARetrieval.py b/mteb/tasks/Retrieval/multilingual/IndicQARetrieval.py index 62a166f89c..e489142d15 100644 --- a/mteb/tasks/Retrieval/multilingual/IndicQARetrieval.py +++ b/mteb/tasks/Retrieval/multilingual/IndicQARetrieval.py @@ -47,13 +47,15 @@ class IndicQARetrieval(MultilingualTask, AbsTaskRetrieval): annotations_creators="human-annotated", dialect=[], sample_creation="machine-translated and verified", - bibtex_citation="""@article{doddapaneni2022towards, - title = {Towards Leaving No Indic Language Behind: Building Monolingual Corpora, Benchmark and Models for Indic Languages}, - author = {Sumanth Doddapaneni and Rahul Aralikatte and Gowtham Ramesh and Shreyansh Goyal and Mitesh M. Khapra and Anoop Kunchukuttan and Pratyush Kumar}, - journal = {Annual Meeting of the Association for Computational Linguistics}, - year = {2022}, - doi = {10.18653/v1/2023.acl-long.693} -}""", + bibtex_citation=r""" +@article{doddapaneni2022towards, + author = {Sumanth Doddapaneni and Rahul Aralikatte and Gowtham Ramesh and Shreyansh Goyal and Mitesh M. 
Khapra and Anoop Kunchukuttan and Pratyush Kumar}, + doi = {10.18653/v1/2023.acl-long.693}, + journal = {Annual Meeting of the Association for Computational Linguistics}, + title = {Towards Leaving No Indic Language Behind: Building Monolingual Corpora, Benchmark and Models for Indic Languages}, + year = {2022}, +} +""", ) def load_data(self, **kwargs): diff --git a/mteb/tasks/Retrieval/multilingual/MIRACLRetrieval.py b/mteb/tasks/Retrieval/multilingual/MIRACLRetrieval.py index a4101abbdb..d32d4d4cbd 100644 --- a/mteb/tasks/Retrieval/multilingual/MIRACLRetrieval.py +++ b/mteb/tasks/Retrieval/multilingual/MIRACLRetrieval.py @@ -125,20 +125,22 @@ class MIRACLRetrieval(MultilingualTask, AbsTaskRetrieval): annotations_creators="expert-annotated", dialect=[], sample_creation="created", - bibtex_citation="""@article{10.1162/tacl_a_00595, - author = {Zhang, Xinyu and Thakur, Nandan and Ogundepo, Odunayo and Kamalloo, Ehsan and Alfonso-Hermelo, David and Li, Xiaoguang and Liu, Qun and Rezagholizadeh, Mehdi and Lin, Jimmy}, - title = "{MIRACL: A Multilingual Retrieval Dataset Covering 18 Diverse Languages}", - journal = {Transactions of the Association for Computational Linguistics}, - volume = {11}, - pages = {1114-1131}, - year = {2023}, - month = {09}, - abstract = "{MIRACL is a multilingual dataset for ad hoc retrieval across 18 languages that collectively encompass over three billion native speakers around the world. This resource is designed to support monolingual retrieval tasks, where the queries and the corpora are in the same language. In total, we have gathered over 726k high-quality relevance judgments for 78k queries over Wikipedia in these languages, where all annotations have been performed by native speakers hired by our team. MIRACL covers languages that are both typologically close as well as distant from 10 language families and 13 sub-families, associated with varying amounts of publicly available resources. 
Extensive automatic heuristic verification and manual assessments were performed during the annotation process to control data quality. In total, MIRACL represents an investment of around five person-years of human annotator effort. Our goal is to spur research on improving retrieval across a continuum of languages, thus enhancing information access capabilities for diverse populations around the world, particularly those that have traditionally been underserved. MIRACL is available at http://miracl.ai/.}", - issn = {2307-387X}, - doi = {10.1162/tacl_a_00595}, - url = {https://doi.org/10.1162/tacl\_a\_00595}, - eprint = {https://direct.mit.edu/tacl/article-pdf/doi/10.1162/tacl\_a\_00595/2157340/tacl\_a\_00595.pdf}, -}""", + bibtex_citation=r""" +@article{10.1162/tacl_a_00595, + abstract = {{MIRACL is a multilingual dataset for ad hoc retrieval across 18 languages that collectively encompass over three billion native speakers around the world. This resource is designed to support monolingual retrieval tasks, where the queries and the corpora are in the same language. In total, we have gathered over 726k high-quality relevance judgments for 78k queries over Wikipedia in these languages, where all annotations have been performed by native speakers hired by our team. MIRACL covers languages that are both typologically close as well as distant from 10 language families and 13 sub-families, associated with varying amounts of publicly available resources. Extensive automatic heuristic verification and manual assessments were performed during the annotation process to control data quality. In total, MIRACL represents an investment of around five person-years of human annotator effort. Our goal is to spur research on improving retrieval across a continuum of languages, thus enhancing information access capabilities for diverse populations around the world, particularly those that have traditionally been underserved. 
MIRACL is available at http://miracl.ai/.}}, + author = {Zhang, Xinyu and Thakur, Nandan and Ogundepo, Odunayo and Kamalloo, Ehsan and Alfonso-Hermelo, David and Li, Xiaoguang and Liu, Qun and Rezagholizadeh, Mehdi and Lin, Jimmy}, + doi = {10.1162/tacl_a_00595}, + eprint = {https://direct.mit.edu/tacl/article-pdf/doi/10.1162/tacl\_a\_00595/2157340/tacl\_a\_00595.pdf}, + issn = {2307-387X}, + journal = {Transactions of the Association for Computational Linguistics}, + month = {09}, + pages = {1114-1131}, + title = {{MIRACL: A Multilingual Retrieval Dataset Covering 18 Diverse Languages}}, + url = {https://doi.org/10.1162/tacl\_a\_00595}, + volume = {11}, + year = {2023}, +} +""", prompt={ "query": "Given a question, retrieve Wikipedia passages that answer the question" }, @@ -320,20 +322,22 @@ class MIRACLRetrievalHardNegatives(MultilingualTask, AbsTaskRetrieval): annotations_creators="expert-annotated", dialect=[], sample_creation="created", - bibtex_citation="""@article{10.1162/tacl_a_00595, - author = {Zhang, Xinyu and Thakur, Nandan and Ogundepo, Odunayo and Kamalloo, Ehsan and Alfonso-Hermelo, David and Li, Xiaoguang and Liu, Qun and Rezagholizadeh, Mehdi and Lin, Jimmy}, - title = "{MIRACL: A Multilingual Retrieval Dataset Covering 18 Diverse Languages}", - journal = {Transactions of the Association for Computational Linguistics}, - volume = {11}, - pages = {1114-1131}, - year = {2023}, - month = {09}, - abstract = "{MIRACL is a multilingual dataset for ad hoc retrieval across 18 languages that collectively encompass over three billion native speakers around the world. This resource is designed to support monolingual retrieval tasks, where the queries and the corpora are in the same language. In total, we have gathered over 726k high-quality relevance judgments for 78k queries over Wikipedia in these languages, where all annotations have been performed by native speakers hired by our team. 
MIRACL covers languages that are both typologically close as well as distant from 10 language families and 13 sub-families, associated with varying amounts of publicly available resources. Extensive automatic heuristic verification and manual assessments were performed during the annotation process to control data quality. In total, MIRACL represents an investment of around five person-years of human annotator effort. Our goal is to spur research on improving retrieval across a continuum of languages, thus enhancing information access capabilities for diverse populations around the world, particularly those that have traditionally been underserved. MIRACL is available at http://miracl.ai/.}", - issn = {2307-387X}, - doi = {10.1162/tacl_a_00595}, - url = {https://doi.org/10.1162/tacl\_a\_00595}, - eprint = {https://direct.mit.edu/tacl/article-pdf/doi/10.1162/tacl\_a\_00595/2157340/tacl\_a\_00595.pdf}, -}""", + bibtex_citation=r""" +@article{10.1162/tacl_a_00595, + abstract = {{MIRACL is a multilingual dataset for ad hoc retrieval across 18 languages that collectively encompass over three billion native speakers around the world. This resource is designed to support monolingual retrieval tasks, where the queries and the corpora are in the same language. In total, we have gathered over 726k high-quality relevance judgments for 78k queries over Wikipedia in these languages, where all annotations have been performed by native speakers hired by our team. MIRACL covers languages that are both typologically close as well as distant from 10 language families and 13 sub-families, associated with varying amounts of publicly available resources. Extensive automatic heuristic verification and manual assessments were performed during the annotation process to control data quality. In total, MIRACL represents an investment of around five person-years of human annotator effort. 
Our goal is to spur research on improving retrieval across a continuum of languages, thus enhancing information access capabilities for diverse populations around the world, particularly those that have traditionally been underserved. MIRACL is available at http://miracl.ai/.}}, + author = {Zhang, Xinyu and Thakur, Nandan and Ogundepo, Odunayo and Kamalloo, Ehsan and Alfonso-Hermelo, David and Li, Xiaoguang and Liu, Qun and Rezagholizadeh, Mehdi and Lin, Jimmy}, + doi = {10.1162/tacl_a_00595}, + eprint = {https://direct.mit.edu/tacl/article-pdf/doi/10.1162/tacl\_a\_00595/2157340/tacl\_a\_00595.pdf}, + issn = {2307-387X}, + journal = {Transactions of the Association for Computational Linguistics}, + month = {09}, + pages = {1114-1131}, + title = {{MIRACL: A Multilingual Retrieval Dataset Covering 18 Diverse Languages}}, + url = {https://doi.org/10.1162/tacl\_a\_00595}, + volume = {11}, + year = {2023}, +} +""", adapted_from=["MIRACLRetrieval"], ) diff --git a/mteb/tasks/Retrieval/multilingual/MLQARetrieval.py b/mteb/tasks/Retrieval/multilingual/MLQARetrieval.py index c03f280b22..8cc7125fb7 100644 --- a/mteb/tasks/Retrieval/multilingual/MLQARetrieval.py +++ b/mteb/tasks/Retrieval/multilingual/MLQARetrieval.py @@ -103,13 +103,15 @@ class MLQARetrieval(AbsTaskRetrieval, MultilingualTask): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@article{lewis2019mlqa, - title = {MLQA: Evaluating Cross-lingual Extractive Question Answering}, - author = {Lewis, Patrick and Oguz, Barlas and Rinott, Ruty and Riedel, Sebastian and Schwenk, Holger}, - journal = {arXiv preprint arXiv:1910.07475}, - year = 2019, - eid = {arXiv: 1910.07475} - }""", + bibtex_citation=r""" +@article{lewis2019mlqa, + author = {Lewis, Patrick and Oguz, Barlas and Rinott, Ruty and Riedel, Sebastian and Schwenk, Holger}, + eid = {arXiv: 1910.07475}, + journal = {arXiv preprint arXiv:1910.07475}, + title = {MLQA: Evaluating Cross-lingual Extractive Question 
Answering}, + year = {2019}, +} +""", ) def load_data(self, **kwargs): diff --git a/mteb/tasks/Retrieval/multilingual/MintakaRetrieval.py b/mteb/tasks/Retrieval/multilingual/MintakaRetrieval.py index 3a44ba4e09..50e0d197f4 100644 --- a/mteb/tasks/Retrieval/multilingual/MintakaRetrieval.py +++ b/mteb/tasks/Retrieval/multilingual/MintakaRetrieval.py @@ -87,19 +87,21 @@ class MintakaRetrieval(MultilingualTask, AbsTaskRetrieval): annotations_creators="derived", # best guess dialect=[], sample_creation="human-translated", - bibtex_citation="""@inproceedings{sen-etal-2022-mintaka, - title = "Mintaka: A Complex, Natural, and Multilingual Dataset for End-to-End Question Answering", - author = "Sen, Priyanka and - Aji, Alham Fikri and - Saffari, Amir", - booktitle = "Proceedings of the 29th International Conference on Computational Linguistics", - month = oct, - year = "2022", - address = "Gyeongju, Republic of Korea", - publisher = "International Committee on Computational Linguistics", - url = "https://aclanthology.org/2022.coling-1.138", - pages = "1604--1619" -}""", + bibtex_citation=r""" +@inproceedings{sen-etal-2022-mintaka, + address = {Gyeongju, Republic of Korea}, + author = {Sen, Priyanka and +Aji, Alham Fikri and +Saffari, Amir}, + booktitle = {Proceedings of the 29th International Conference on Computational Linguistics}, + month = oct, + pages = {1604--1619}, + publisher = {International Committee on Computational Linguistics}, + title = {Mintaka: A Complex, Natural, and Multilingual Dataset for End-to-End Question Answering}, + url = {https://aclanthology.org/2022.coling-1.138}, + year = {2022}, +} +""", ) def load_data(self, **kwargs): diff --git a/mteb/tasks/Retrieval/multilingual/MrTidyRetrieval.py b/mteb/tasks/Retrieval/multilingual/MrTidyRetrieval.py index 5fcd725ac5..a11ef0df92 100644 --- a/mteb/tasks/Retrieval/multilingual/MrTidyRetrieval.py +++ b/mteb/tasks/Retrieval/multilingual/MrTidyRetrieval.py @@ -107,12 +107,14 @@ class 
MrTidyRetrieval(MultilingualTask, AbsTaskRetrieval): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@article{mrtydi, - title={{Mr. TyDi}: A Multi-lingual Benchmark for Dense Retrieval}, - author={Xinyu Zhang and Xueguang Ma and Peng Shi and Jimmy Lin}, - year={2021}, - journal={arXiv:2108.08787}, - }""", + bibtex_citation=r""" +@article{mrtydi, + author = {Xinyu Zhang and Xueguang Ma and Peng Shi and Jimmy Lin}, + journal = {arXiv:2108.08787}, + title = {{Mr. TyDi}: A Multi-lingual Benchmark for Dense Retrieval}, + year = {2021}, +} +""", ) def load_data(self, **kwargs): diff --git a/mteb/tasks/Retrieval/multilingual/MultiLongDocRetrieval.py b/mteb/tasks/Retrieval/multilingual/MultiLongDocRetrieval.py index 65cd3c6467..e143dac611 100644 --- a/mteb/tasks/Retrieval/multilingual/MultiLongDocRetrieval.py +++ b/mteb/tasks/Retrieval/multilingual/MultiLongDocRetrieval.py @@ -92,13 +92,14 @@ class MultiLongDocRetrieval(MultilingualTask, AbsTaskRetrieval): annotations_creators="LM-generated", # gpt-3.5 dialect=[], sample_creation="found", - bibtex_citation="""@misc{bge-m3, - title={BGE M3-Embedding: Multi-Lingual, Multi-Functionality, Multi-Granularity Text Embeddings Through Self-Knowledge Distillation}, - author={Jianlv Chen and Shitao Xiao and Peitian Zhang and Kun Luo and Defu Lian and Zheng Liu}, - year={2024}, - eprint={2402.03216}, - archivePrefix={arXiv}, - primaryClass={cs.CL} + bibtex_citation=r""" +@misc{bge-m3, + archiveprefix = {arXiv}, + author = {Jianlv Chen and Shitao Xiao and Peitian Zhang and Kun Luo and Defu Lian and Zheng Liu}, + eprint = {2402.03216}, + primaryclass = {cs.CL}, + title = {BGE M3-Embedding: Multi-Lingual, Multi-Functionality, Multi-Granularity Text Embeddings Through Self-Knowledge Distillation}, + year = {2024}, } """, ) diff --git a/mteb/tasks/Retrieval/multilingual/NeuCLIR2022Retrieval.py b/mteb/tasks/Retrieval/multilingual/NeuCLIR2022Retrieval.py index 2872a4d396..865473c0dc 100644 
--- a/mteb/tasks/Retrieval/multilingual/NeuCLIR2022Retrieval.py +++ b/mteb/tasks/Retrieval/multilingual/NeuCLIR2022Retrieval.py @@ -80,12 +80,14 @@ class NeuCLIR2022Retrieval(MultilingualTask, AbsTaskRetrieval): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@article{lawrie2023overview, - title={Overview of the TREC 2022 NeuCLIR track}, - author={Lawrie, Dawn and MacAvaney, Sean and Mayfield, James and McNamee, Paul and Oard, Douglas W and Soldaini, Luca and Yang, Eugene}, - journal={arXiv preprint arXiv:2304.12367}, - year={2023} -}""", + bibtex_citation=r""" +@article{lawrie2023overview, + author = {Lawrie, Dawn and MacAvaney, Sean and Mayfield, James and McNamee, Paul and Oard, Douglas W and Soldaini, Luca and Yang, Eugene}, + journal = {arXiv preprint arXiv:2304.12367}, + title = {Overview of the TREC 2022 NeuCLIR track}, + year = {2023}, +} +""", ) def load_data(self, **kwargs): @@ -193,12 +195,14 @@ class NeuCLIR2022RetrievalHardNegatives(MultilingualTask, AbsTaskRetrieval): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@article{lawrie2023overview, - title={Overview of the TREC 2022 NeuCLIR track}, - author={Lawrie, Dawn and MacAvaney, Sean and Mayfield, James and McNamee, Paul and Oard, Douglas W and Soldaini, Luca and Yang, Eugene}, - journal={arXiv preprint arXiv:2304.12367}, - year={2023} -}""", + bibtex_citation=r""" +@article{lawrie2023overview, + author = {Lawrie, Dawn and MacAvaney, Sean and Mayfield, James and McNamee, Paul and Oard, Douglas W and Soldaini, Luca and Yang, Eugene}, + journal = {arXiv preprint arXiv:2304.12367}, + title = {Overview of the TREC 2022 NeuCLIR track}, + year = {2023}, +} +""", adapted_from=["NeuCLIR2022Retrieval"], ) diff --git a/mteb/tasks/Retrieval/multilingual/NeuCLIR2023Retrieval.py b/mteb/tasks/Retrieval/multilingual/NeuCLIR2023Retrieval.py index 675505df85..f28198b474 100644 --- 
a/mteb/tasks/Retrieval/multilingual/NeuCLIR2023Retrieval.py +++ b/mteb/tasks/Retrieval/multilingual/NeuCLIR2023Retrieval.py @@ -79,14 +79,16 @@ class NeuCLIR2023Retrieval(MultilingualTask, AbsTaskRetrieval): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@misc{lawrie2024overview, - title={Overview of the TREC 2023 NeuCLIR Track}, - author={Dawn Lawrie and Sean MacAvaney and James Mayfield and Paul McNamee and Douglas W. Oard and Luca Soldaini and Eugene Yang}, - year={2024}, - eprint={2404.08071}, - archivePrefix={arXiv}, - primaryClass={cs.IR} -}""", + bibtex_citation=r""" +@misc{lawrie2024overview, + archiveprefix = {arXiv}, + author = {Dawn Lawrie and Sean MacAvaney and James Mayfield and Paul McNamee and Douglas W. Oard and Luca Soldaini and Eugene Yang}, + eprint = {2404.08071}, + primaryclass = {cs.IR}, + title = {Overview of the TREC 2023 NeuCLIR Track}, + year = {2024}, +} +""", ) def load_data(self, **kwargs): @@ -194,14 +196,16 @@ class NeuCLIR2023RetrievalHardNegatives(MultilingualTask, AbsTaskRetrieval): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@misc{lawrie2024overview, - title={Overview of the TREC 2023 NeuCLIR Track}, - author={Dawn Lawrie and Sean MacAvaney and James Mayfield and Paul McNamee and Douglas W. Oard and Luca Soldaini and Eugene Yang}, - year={2024}, - eprint={2404.08071}, - archivePrefix={arXiv}, - primaryClass={cs.IR} -}""", + bibtex_citation=r""" +@misc{lawrie2024overview, + archiveprefix = {arXiv}, + author = {Dawn Lawrie and Sean MacAvaney and James Mayfield and Paul McNamee and Douglas W. 
Oard and Luca Soldaini and Eugene Yang}, + eprint = {2404.08071}, + primaryclass = {cs.IR}, + title = {Overview of the TREC 2023 NeuCLIR Track}, + year = {2024}, +} +""", adapted_from=["NeuCLIR2022Retrieval"], ) diff --git a/mteb/tasks/Retrieval/multilingual/PublicHealthQARetrieval.py b/mteb/tasks/Retrieval/multilingual/PublicHealthQARetrieval.py index 6f7d188b7b..974173c82b 100644 --- a/mteb/tasks/Retrieval/multilingual/PublicHealthQARetrieval.py +++ b/mteb/tasks/Retrieval/multilingual/PublicHealthQARetrieval.py @@ -85,14 +85,14 @@ class PublicHealthQARetrieval(MultilingualTask, AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation=""" -@misc {xing_han_lu_2024, - author = { {Xing Han Lu} }, - title = { publichealth-qa (Revision 3b67b6b) }, - year = 2024, - url = { https://huggingface.co/datasets/xhluca/publichealth-qa }, - doi = { 10.57967/hf/2247 }, - publisher = { Hugging Face } + bibtex_citation=r""" +@misc{xing_han_lu_2024, + author = { {Xing Han Lu} }, + doi = { 10.57967/hf/2247 }, + publisher = { Hugging Face }, + title = { publichealth-qa (Revision 3b67b6b) }, + url = { https://huggingface.co/datasets/xhluca/publichealth-qa }, + year = {2024}, } """, ) diff --git a/mteb/tasks/Retrieval/multilingual/StatcanDialogueDatasetRetrieval.py b/mteb/tasks/Retrieval/multilingual/StatcanDialogueDatasetRetrieval.py index ab7e178c82..c607c7c408 100644 --- a/mteb/tasks/Retrieval/multilingual/StatcanDialogueDatasetRetrieval.py +++ b/mteb/tasks/Retrieval/multilingual/StatcanDialogueDatasetRetrieval.py @@ -85,19 +85,19 @@ class StatcanDialogueDatasetRetrieval(MultilingualTask, AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation=""" + bibtex_citation=r""" @inproceedings{lu-etal-2023-statcan, - title = "The {S}tat{C}an Dialogue Dataset: Retrieving Data Tables through Conversations with Genuine Intents", - author = "Lu, Xing Han and - Reddy, Siva and - de Vries, Harm", - 
booktitle = "Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics", - month = may, - year = "2023", - address = "Dubrovnik, Croatia", - publisher = "Association for Computational Linguistics", - url = "https://arxiv.org/abs/2304.01412", - pages = "2799--2829", + address = {Dubrovnik, Croatia}, + author = {Lu, Xing Han and +Reddy, Siva and +de Vries, Harm}, + booktitle = {Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics}, + month = may, + pages = {2799--2829}, + publisher = {Association for Computational Linguistics}, + title = {The {S}tat{C}an Dialogue Dataset: Retrieving Data Tables through Conversations with Genuine Intents}, + url = {https://arxiv.org/abs/2304.01412}, + year = {2023}, } """, ) diff --git a/mteb/tasks/Retrieval/multilingual/WebFAQRetrieval.py b/mteb/tasks/Retrieval/multilingual/WebFAQRetrieval.py index 703103a782..64e5646396 100644 --- a/mteb/tasks/Retrieval/multilingual/WebFAQRetrieval.py +++ b/mteb/tasks/Retrieval/multilingual/WebFAQRetrieval.py @@ -147,15 +147,17 @@ class WebFAQRetrieval(MultilingualTask, AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@misc{dinzinger2025webfaq, - title={WebFAQ: A Multilingual Collection of Natural Q&A Datasets for Dense Retrieval}, - author={Michael Dinzinger and Laura Caspari and Kanishka Ghosh Dastidar and Jelena Mitrović and Michael Granitzer}, - year={2025}, - eprint={2502.20936}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2502.20936}, -}""", + bibtex_citation=r""" +@misc{dinzinger2025webfaq, + archiveprefix = {arXiv}, + author = {Michael Dinzinger and Laura Caspari and Kanishka Ghosh Dastidar and Jelena Mitrović and Michael Granitzer}, + eprint = {2502.20936}, + primaryclass = {cs.CL}, + title = {WebFAQ: A Multilingual Collection of Natural Q&A Datasets for Dense Retrieval}, + url = 
{https://arxiv.org/abs/2502.20936}, + year = {2025}, +} +""", ) def load_data(self, **kwargs): diff --git a/mteb/tasks/Retrieval/multilingual/XMarketRetrieval.py b/mteb/tasks/Retrieval/multilingual/XMarketRetrieval.py index 01d240eb9d..f630009419 100644 --- a/mteb/tasks/Retrieval/multilingual/XMarketRetrieval.py +++ b/mteb/tasks/Retrieval/multilingual/XMarketRetrieval.py @@ -86,15 +86,20 @@ class XMarket(MultilingualTask, AbsTaskRetrieval): annotations_creators=None, dialect=None, sample_creation=None, - bibtex_citation="""@inproceedings{Bonab_2021, series={CIKM ’21}, - title={Cross-Market Product Recommendation}, - url={http://dx.doi.org/10.1145/3459637.3482493}, - DOI={10.1145/3459637.3482493}, - booktitle={Proceedings of the 30th ACM International Conference on Information & Knowledge Management}, - publisher={ACM}, - author={Bonab, Hamed and Aliannejadi, Mohammad and Vardasbi, Ali and Kanoulas, Evangelos and Allan, James}, - year={2021}, - month=oct, collection={CIKM ’21} }""", + bibtex_citation=r""" +@inproceedings{Bonab_2021, + author = {Bonab, Hamed and Aliannejadi, Mohammad and Vardasbi, Ali and Kanoulas, Evangelos and Allan, James}, + booktitle = {Proceedings of the 30th ACM International Conference on Information & Knowledge Management}, + collection = {CIKM ’21}, + doi = {10.1145/3459637.3482493}, + month = oct, + publisher = {ACM}, + series = {CIKM ’21}, + title = {Cross-Market Product Recommendation}, + url = {http://dx.doi.org/10.1145/3459637.3482493}, + year = {2021}, +} +""", ) def load_data(self, **kwargs): diff --git a/mteb/tasks/Retrieval/multilingual/XPQARetrieval.py b/mteb/tasks/Retrieval/multilingual/XPQARetrieval.py index 72cbbd6dab..942481138f 100644 --- a/mteb/tasks/Retrieval/multilingual/XPQARetrieval.py +++ b/mteb/tasks/Retrieval/multilingual/XPQARetrieval.py @@ -85,13 +85,15 @@ class XPQARetrieval(AbsTaskRetrieval, MultilingualTask): annotations_creators="human-annotated", dialect=[], sample_creation="found", - 
bibtex_citation="""@inproceedings{shen2023xpqa, - title={xPQA: Cross-Lingual Product Question Answering in 12 Languages}, - author={Shen, Xiaoyu and Asai, Akari and Byrne, Bill and De Gispert, Adria}, - booktitle={Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 5: Industry Track)}, - pages={103--115}, - year={2023} - }""", + bibtex_citation=r""" +@inproceedings{shen2023xpqa, + author = {Shen, Xiaoyu and Asai, Akari and Byrne, Bill and De Gispert, Adria}, + booktitle = {Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 5: Industry Track)}, + pages = {103--115}, + title = {xPQA: Cross-Lingual Product Question Answering in 12 Languages}, + year = {2023}, +} +""", ) def load_data(self, **kwargs): diff --git a/mteb/tasks/Retrieval/multilingual/XQuADRetrieval.py b/mteb/tasks/Retrieval/multilingual/XQuADRetrieval.py index 4d952896e3..2886eeffe4 100644 --- a/mteb/tasks/Retrieval/multilingual/XQuADRetrieval.py +++ b/mteb/tasks/Retrieval/multilingual/XQuADRetrieval.py @@ -47,23 +47,25 @@ class XQuADRetrieval(MultilingualTask, AbsTaskRetrieval): annotations_creators="human-annotated", dialect=[], sample_creation="created", - bibtex_citation="""@article{Artetxe:etal:2019, - author = {Mikel Artetxe and Sebastian Ruder and Dani Yogatama}, - title = {On the cross-lingual transferability of monolingual representations}, - journal = {CoRR}, - volume = {abs/1910.11856}, - year = {2019}, - archivePrefix = {arXiv}, - eprint = {1910.11856} + bibtex_citation=r""" +@article{Artetxe:etal:2019, + archiveprefix = {arXiv}, + author = {Mikel Artetxe and Sebastian Ruder and Dani Yogatama}, + eprint = {1910.11856}, + journal = {CoRR}, + title = {On the cross-lingual transferability of monolingual representations}, + volume = {abs/1910.11856}, + year = {2019}, } -@inproceedings{ - dumitrescu2021liro, - title={LiRo: Benchmark and leaderboard for Romanian language tasks}, - author={Stefan Daniel 
Dumitrescu and Petru Rebeja and Beata Lorincz and Mihaela Gaman and Andrei Avram and Mihai Ilie and Andrei Pruteanu and Adriana Stan and Lorena Rosia and Cristina Iacobescu and Luciana Morogan and George Dima and Gabriel Marchidan and Traian Rebedea and Madalina Chitez and Dani Yogatama and Sebastian Ruder and Radu Tudor Ionescu and Razvan Pascanu and Viorica Patraucean}, - booktitle={Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 1)}, - year={2021}, - url={https://openreview.net/forum?id=JH61CD7afTv} -}""", + +@inproceedings{dumitrescu2021liro, + author = {Stefan Daniel Dumitrescu and Petru Rebeja and Beata Lorincz and Mihaela Gaman and Andrei Avram and Mihai Ilie and Andrei Pruteanu and Adriana Stan and Lorena Rosia and Cristina Iacobescu and Luciana Morogan and George Dima and Gabriel Marchidan and Traian Rebedea and Madalina Chitez and Dani Yogatama and Sebastian Ruder and Radu Tudor Ionescu and Razvan Pascanu and Viorica Patraucean}, + booktitle = {Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 1)}, + title = {LiRo: Benchmark and leaderboard for Romanian language tasks}, + url = {https://openreview.net/forum?id=JH61CD7afTv}, + year = {2021}, +} +""", ) def load_data(self, **kwargs): diff --git a/mteb/tasks/Retrieval/nld/ArguAnaNLRetrieval.py b/mteb/tasks/Retrieval/nld/ArguAnaNLRetrieval.py index 2094bf8081..d38c2d500b 100644 --- a/mteb/tasks/Retrieval/nld/ArguAnaNLRetrieval.py +++ b/mteb/tasks/Retrieval/nld/ArguAnaNLRetrieval.py @@ -30,14 +30,16 @@ class ArguAnaNL(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="machine-translated and verified", # manually checked a small subset - bibtex_citation="""@misc{banar2024beirnlzeroshotinformationretrieval, - title={BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, - author={Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, - year={2024}, - 
eprint={2412.08329}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2412.08329}, -}""", + bibtex_citation=r""" +@misc{banar2024beirnlzeroshotinformationretrieval, + archiveprefix = {arXiv}, + author = {Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, + eprint = {2412.08329}, + primaryclass = {cs.CL}, + title = {BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, + url = {https://arxiv.org/abs/2412.08329}, + year = {2024}, +} +""", adapted_from=["ArguAna"], ) diff --git a/mteb/tasks/Retrieval/nld/CQADupstackAndroidNLRetrieval.py b/mteb/tasks/Retrieval/nld/CQADupstackAndroidNLRetrieval.py index 86a40620a5..98dc5c6848 100644 --- a/mteb/tasks/Retrieval/nld/CQADupstackAndroidNLRetrieval.py +++ b/mteb/tasks/Retrieval/nld/CQADupstackAndroidNLRetrieval.py @@ -31,15 +31,17 @@ class CQADupstackAndroidNLRetrieval(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="machine-translated and verified", # manually checked a small subset - bibtex_citation="""@misc{banar2024beirnlzeroshotinformationretrieval, - title={BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, - author={Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, - year={2024}, - eprint={2412.08329}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2412.08329}, -}""", + bibtex_citation=r""" +@misc{banar2024beirnlzeroshotinformationretrieval, + archiveprefix = {arXiv}, + author = {Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, + eprint = {2412.08329}, + primaryclass = {cs.CL}, + title = {BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, + url = {https://arxiv.org/abs/2412.08329}, + year = {2024}, +} +""", adapted_from=["CQADupstackAndroid"], ) diff --git a/mteb/tasks/Retrieval/nld/CQADupstackEnglishNLRetrieval.py b/mteb/tasks/Retrieval/nld/CQADupstackEnglishNLRetrieval.py index e47cf3d139..de9f34a85a 100644 --- 
a/mteb/tasks/Retrieval/nld/CQADupstackEnglishNLRetrieval.py +++ b/mteb/tasks/Retrieval/nld/CQADupstackEnglishNLRetrieval.py @@ -31,15 +31,17 @@ class CQADupstackEnglishNLRetrieval(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="machine-translated and verified", # manually checked a small subset - bibtex_citation="""@misc{banar2024beirnlzeroshotinformationretrieval, - title={BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, - author={Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, - year={2024}, - eprint={2412.08329}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2412.08329}, -}""", + bibtex_citation=r""" +@misc{banar2024beirnlzeroshotinformationretrieval, + archiveprefix = {arXiv}, + author = {Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, + eprint = {2412.08329}, + primaryclass = {cs.CL}, + title = {BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, + url = {https://arxiv.org/abs/2412.08329}, + year = {2024}, +} +""", adapted_from=["CQADupstackEnglish"], ) diff --git a/mteb/tasks/Retrieval/nld/CQADupstackGamingNLRetrieval.py b/mteb/tasks/Retrieval/nld/CQADupstackGamingNLRetrieval.py index 6c787dc23d..d808774507 100644 --- a/mteb/tasks/Retrieval/nld/CQADupstackGamingNLRetrieval.py +++ b/mteb/tasks/Retrieval/nld/CQADupstackGamingNLRetrieval.py @@ -31,15 +31,17 @@ class CQADupstackGamingNLRetrieval(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="machine-translated and verified", # manually checked a small subset - bibtex_citation="""@misc{banar2024beirnlzeroshotinformationretrieval, - title={BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, - author={Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, - year={2024}, - eprint={2412.08329}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2412.08329}, -}""", + bibtex_citation=r""" 
+@misc{banar2024beirnlzeroshotinformationretrieval, + archiveprefix = {arXiv}, + author = {Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, + eprint = {2412.08329}, + primaryclass = {cs.CL}, + title = {BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, + url = {https://arxiv.org/abs/2412.08329}, + year = {2024}, +} +""", adapted_from=["CQADupstackGamingRetrieval"], ) diff --git a/mteb/tasks/Retrieval/nld/CQADupstackGisNLRetrieval.py b/mteb/tasks/Retrieval/nld/CQADupstackGisNLRetrieval.py index 08e57961f9..62664ddf69 100644 --- a/mteb/tasks/Retrieval/nld/CQADupstackGisNLRetrieval.py +++ b/mteb/tasks/Retrieval/nld/CQADupstackGisNLRetrieval.py @@ -31,15 +31,17 @@ class CQADupstackGisNLRetrieval(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="machine-translated and verified", # manually checked a small subset - bibtex_citation="""@misc{banar2024beirnlzeroshotinformationretrieval, - title={BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, - author={Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, - year={2024}, - eprint={2412.08329}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2412.08329}, -}""", + bibtex_citation=r""" +@misc{banar2024beirnlzeroshotinformationretrieval, + archiveprefix = {arXiv}, + author = {Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, + eprint = {2412.08329}, + primaryclass = {cs.CL}, + title = {BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, + url = {https://arxiv.org/abs/2412.08329}, + year = {2024}, +} +""", adapted_from=["CQADupstackGisRetrieval"], ) diff --git a/mteb/tasks/Retrieval/nld/CQADupstackMathematicaNLRetrieval.py b/mteb/tasks/Retrieval/nld/CQADupstackMathematicaNLRetrieval.py index 44909f261b..0ca7f3dfbc 100644 --- a/mteb/tasks/Retrieval/nld/CQADupstackMathematicaNLRetrieval.py +++ b/mteb/tasks/Retrieval/nld/CQADupstackMathematicaNLRetrieval.py @@ -31,15 +31,17 @@ class 
CQADupstackMathematicaNLRetrieval(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="machine-translated and verified", # manually checked a small subset - bibtex_citation="""@misc{banar2024beirnlzeroshotinformationretrieval, - title={BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, - author={Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, - year={2024}, - eprint={2412.08329}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2412.08329}, -}""", + bibtex_citation=r""" +@misc{banar2024beirnlzeroshotinformationretrieval, + archiveprefix = {arXiv}, + author = {Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, + eprint = {2412.08329}, + primaryclass = {cs.CL}, + title = {BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, + url = {https://arxiv.org/abs/2412.08329}, + year = {2024}, +} +""", adapted_from=["CQADupstackMathematicaRetrieval"], ) diff --git a/mteb/tasks/Retrieval/nld/CQADupstackPhysicsNLRetrieval.py b/mteb/tasks/Retrieval/nld/CQADupstackPhysicsNLRetrieval.py index 96f65f61d2..ccbe0ce1fe 100644 --- a/mteb/tasks/Retrieval/nld/CQADupstackPhysicsNLRetrieval.py +++ b/mteb/tasks/Retrieval/nld/CQADupstackPhysicsNLRetrieval.py @@ -31,15 +31,17 @@ class CQADupstackPhysicsNLRetrieval(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="machine-translated and verified", # manually checked a small subset - bibtex_citation="""@misc{banar2024beirnlzeroshotinformationretrieval, - title={BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, - author={Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, - year={2024}, - eprint={2412.08329}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2412.08329}, -}""", + bibtex_citation=r""" +@misc{banar2024beirnlzeroshotinformationretrieval, + archiveprefix = {arXiv}, + author = {Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, + eprint = 
{2412.08329}, + primaryclass = {cs.CL}, + title = {BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, + url = {https://arxiv.org/abs/2412.08329}, + year = {2024}, +} +""", adapted_from=["CQADupstackPhysicsRetrieval"], ) diff --git a/mteb/tasks/Retrieval/nld/CQADupstackProgrammersNLRetrieval.py b/mteb/tasks/Retrieval/nld/CQADupstackProgrammersNLRetrieval.py index f9ee41a024..09a158bfac 100644 --- a/mteb/tasks/Retrieval/nld/CQADupstackProgrammersNLRetrieval.py +++ b/mteb/tasks/Retrieval/nld/CQADupstackProgrammersNLRetrieval.py @@ -31,15 +31,17 @@ class CQADupstackProgrammersNLRetrieval(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="machine-translated and verified", # manually checked a small subset - bibtex_citation="""@misc{banar2024beirnlzeroshotinformationretrieval, - title={BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, - author={Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, - year={2024}, - eprint={2412.08329}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2412.08329}, -}""", + bibtex_citation=r""" +@misc{banar2024beirnlzeroshotinformationretrieval, + archiveprefix = {arXiv}, + author = {Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, + eprint = {2412.08329}, + primaryclass = {cs.CL}, + title = {BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, + url = {https://arxiv.org/abs/2412.08329}, + year = {2024}, +} +""", adapted_from=["CQADupstackProgrammersRetrieval"], ) diff --git a/mteb/tasks/Retrieval/nld/CQADupstackStatsNLRetrieval.py b/mteb/tasks/Retrieval/nld/CQADupstackStatsNLRetrieval.py index 97642a45cd..31949a24d3 100644 --- a/mteb/tasks/Retrieval/nld/CQADupstackStatsNLRetrieval.py +++ b/mteb/tasks/Retrieval/nld/CQADupstackStatsNLRetrieval.py @@ -31,15 +31,17 @@ class CQADupstackStatsNLRetrieval(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="machine-translated and 
verified", # manually checked a small subset - bibtex_citation="""@misc{banar2024beirnlzeroshotinformationretrieval, - title={BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, - author={Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, - year={2024}, - eprint={2412.08329}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2412.08329}, -}""", + bibtex_citation=r""" +@misc{banar2024beirnlzeroshotinformationretrieval, + archiveprefix = {arXiv}, + author = {Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, + eprint = {2412.08329}, + primaryclass = {cs.CL}, + title = {BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, + url = {https://arxiv.org/abs/2412.08329}, + year = {2024}, +} +""", adapted_from=["CQADupstackStatsRetrieval"], ) diff --git a/mteb/tasks/Retrieval/nld/CQADupstackTexNLRetrieval.py b/mteb/tasks/Retrieval/nld/CQADupstackTexNLRetrieval.py index f09f7df03c..eff835af20 100644 --- a/mteb/tasks/Retrieval/nld/CQADupstackTexNLRetrieval.py +++ b/mteb/tasks/Retrieval/nld/CQADupstackTexNLRetrieval.py @@ -31,15 +31,17 @@ class CQADupstackTexNLRetrieval(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="machine-translated and verified", # manually checked a small subset - bibtex_citation="""@misc{banar2024beirnlzeroshotinformationretrieval, - title={BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, - author={Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, - year={2024}, - eprint={2412.08329}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2412.08329}, -}""", + bibtex_citation=r""" +@misc{banar2024beirnlzeroshotinformationretrieval, + archiveprefix = {arXiv}, + author = {Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, + eprint = {2412.08329}, + primaryclass = {cs.CL}, + title = {BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, + url = 
{https://arxiv.org/abs/2412.08329}, + year = {2024}, +} +""", adapted_from=["CQADupstackTexRetrieval"], ) diff --git a/mteb/tasks/Retrieval/nld/CQADupstackUnixNLRetrieval.py b/mteb/tasks/Retrieval/nld/CQADupstackUnixNLRetrieval.py index f67b576d4d..3ce79d2048 100644 --- a/mteb/tasks/Retrieval/nld/CQADupstackUnixNLRetrieval.py +++ b/mteb/tasks/Retrieval/nld/CQADupstackUnixNLRetrieval.py @@ -31,15 +31,17 @@ class CQADupstackUnixNLRetrieval(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="machine-translated and verified", # manually checked a small subset - bibtex_citation="""@misc{banar2024beirnlzeroshotinformationretrieval, - title={BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, - author={Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, - year={2024}, - eprint={2412.08329}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2412.08329}, -}""", + bibtex_citation=r""" +@misc{banar2024beirnlzeroshotinformationretrieval, + archiveprefix = {arXiv}, + author = {Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, + eprint = {2412.08329}, + primaryclass = {cs.CL}, + title = {BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, + url = {https://arxiv.org/abs/2412.08329}, + year = {2024}, +} +""", adapted_from=["CQADupstackUnixRetrieval"], ) diff --git a/mteb/tasks/Retrieval/nld/CQADupstackWebmastersNLRetrieval.py b/mteb/tasks/Retrieval/nld/CQADupstackWebmastersNLRetrieval.py index e834480e98..ecd7e570b8 100644 --- a/mteb/tasks/Retrieval/nld/CQADupstackWebmastersNLRetrieval.py +++ b/mteb/tasks/Retrieval/nld/CQADupstackWebmastersNLRetrieval.py @@ -31,15 +31,17 @@ class CQADupstackWebmastersNLRetrieval(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="machine-translated and verified", # manually checked a small subset - bibtex_citation="""@misc{banar2024beirnlzeroshotinformationretrieval, - title={BEIR-NL: Zero-shot Information 
Retrieval Benchmark for the Dutch Language}, - author={Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, - year={2024}, - eprint={2412.08329}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2412.08329}, -}""", + bibtex_citation=r""" +@misc{banar2024beirnlzeroshotinformationretrieval, + archiveprefix = {arXiv}, + author = {Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, + eprint = {2412.08329}, + primaryclass = {cs.CL}, + title = {BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, + url = {https://arxiv.org/abs/2412.08329}, + year = {2024}, +} +""", adapted_from=["CQADupstackWebmasters"], ) diff --git a/mteb/tasks/Retrieval/nld/CQADupstackWordpressNLRetrieval.py b/mteb/tasks/Retrieval/nld/CQADupstackWordpressNLRetrieval.py index b9faf5a841..4735d57eab 100644 --- a/mteb/tasks/Retrieval/nld/CQADupstackWordpressNLRetrieval.py +++ b/mteb/tasks/Retrieval/nld/CQADupstackWordpressNLRetrieval.py @@ -31,15 +31,17 @@ class CQADupstackWordpressNLRetrieval(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="machine-translated and verified", # manually checked a small subset - bibtex_citation="""@misc{banar2024beirnlzeroshotinformationretrieval, - title={BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, - author={Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, - year={2024}, - eprint={2412.08329}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2412.08329}, -}""", + bibtex_citation=r""" +@misc{banar2024beirnlzeroshotinformationretrieval, + archiveprefix = {arXiv}, + author = {Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, + eprint = {2412.08329}, + primaryclass = {cs.CL}, + title = {BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, + url = {https://arxiv.org/abs/2412.08329}, + year = {2024}, +} +""", adapted_from=["CQADupstackWordpressRetrieval"], ) diff --git 
a/mteb/tasks/Retrieval/nld/ClimateFEVERNLRetrieval.py b/mteb/tasks/Retrieval/nld/ClimateFEVERNLRetrieval.py index b43c39bdbb..222c46d871 100644 --- a/mteb/tasks/Retrieval/nld/ClimateFEVERNLRetrieval.py +++ b/mteb/tasks/Retrieval/nld/ClimateFEVERNLRetrieval.py @@ -28,14 +28,16 @@ class ClimateFEVERNL(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="machine-translated and verified", # manually checked a small subset - bibtex_citation="""@misc{banar2024beirnlzeroshotinformationretrieval, - title={BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, - author={Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, - year={2024}, - eprint={2412.08329}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2412.08329}, -}""", + bibtex_citation=r""" +@misc{banar2024beirnlzeroshotinformationretrieval, + archiveprefix = {arXiv}, + author = {Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, + eprint = {2412.08329}, + primaryclass = {cs.CL}, + title = {BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, + url = {https://arxiv.org/abs/2412.08329}, + year = {2024}, +} +""", adapted_from=["ClimateFEVER"], ) diff --git a/mteb/tasks/Retrieval/nld/DBPediaNLRetrieval.py b/mteb/tasks/Retrieval/nld/DBPediaNLRetrieval.py index 90a3675106..501b64a4da 100644 --- a/mteb/tasks/Retrieval/nld/DBPediaNLRetrieval.py +++ b/mteb/tasks/Retrieval/nld/DBPediaNLRetrieval.py @@ -27,15 +27,17 @@ class DBPediaNL(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="machine-translated and verified", # manually checked a small subset - bibtex_citation="""@misc{banar2024beirnlzeroshotinformationretrieval, - title={BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, - author={Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, - year={2024}, - eprint={2412.08329}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2412.08329}, 
-}""", + bibtex_citation=r""" +@misc{banar2024beirnlzeroshotinformationretrieval, + archiveprefix = {arXiv}, + author = {Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, + eprint = {2412.08329}, + primaryclass = {cs.CL}, + title = {BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, + url = {https://arxiv.org/abs/2412.08329}, + year = {2024}, +} +""", prompt={ "query": "Given a query, retrieve relevant entity descriptions from DBPedia" }, diff --git a/mteb/tasks/Retrieval/nld/FEVERNLRetrieval.py b/mteb/tasks/Retrieval/nld/FEVERNLRetrieval.py index 3c00d578b2..60b995c8e6 100644 --- a/mteb/tasks/Retrieval/nld/FEVERNLRetrieval.py +++ b/mteb/tasks/Retrieval/nld/FEVERNLRetrieval.py @@ -33,14 +33,16 @@ class FEVERNL(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="machine-translated and verified", # manually checked a small subset - bibtex_citation="""@misc{banar2024beirnlzeroshotinformationretrieval, - title={BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, - author={Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, - year={2024}, - eprint={2412.08329}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2412.08329}, -}""", + bibtex_citation=r""" +@misc{banar2024beirnlzeroshotinformationretrieval, + archiveprefix = {arXiv}, + author = {Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, + eprint = {2412.08329}, + primaryclass = {cs.CL}, + title = {BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, + url = {https://arxiv.org/abs/2412.08329}, + year = {2024}, +} +""", adapted_from=["FEVER"], ) diff --git a/mteb/tasks/Retrieval/nld/FiQA2018NLRetrieval.py b/mteb/tasks/Retrieval/nld/FiQA2018NLRetrieval.py index 9be7798bfc..b3ef57e362 100644 --- a/mteb/tasks/Retrieval/nld/FiQA2018NLRetrieval.py +++ b/mteb/tasks/Retrieval/nld/FiQA2018NLRetrieval.py @@ -29,14 +29,16 @@ class FiQA2018NL(AbsTaskRetrieval): annotations_creators="derived", 
dialect=[], sample_creation="machine-translated and verified", # manually checked a small subset - bibtex_citation="""@misc{banar2024beirnlzeroshotinformationretrieval, - title={BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, - author={Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, - year={2024}, - eprint={2412.08329}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2412.08329}, -}""", + bibtex_citation=r""" +@misc{banar2024beirnlzeroshotinformationretrieval, + archiveprefix = {arXiv}, + author = {Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, + eprint = {2412.08329}, + primaryclass = {cs.CL}, + title = {BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, + url = {https://arxiv.org/abs/2412.08329}, + year = {2024}, +} +""", adapted_from=["FiQA2018"], ) diff --git a/mteb/tasks/Retrieval/nld/HotpotQANLRetrieval.py b/mteb/tasks/Retrieval/nld/HotpotQANLRetrieval.py index 642ee6920a..73ff4d625e 100644 --- a/mteb/tasks/Retrieval/nld/HotpotQANLRetrieval.py +++ b/mteb/tasks/Retrieval/nld/HotpotQANLRetrieval.py @@ -31,14 +31,16 @@ class HotpotQANL(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="machine-translated and verified", # manually checked a small subset - bibtex_citation="""@misc{banar2024beirnlzeroshotinformationretrieval, - title={BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, - author={Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, - year={2024}, - eprint={2412.08329}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2412.08329}, -}""", + bibtex_citation=r""" +@misc{banar2024beirnlzeroshotinformationretrieval, + archiveprefix = {arXiv}, + author = {Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, + eprint = {2412.08329}, + primaryclass = {cs.CL}, + title = {BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, + url = 
{https://arxiv.org/abs/2412.08329}, + year = {2024}, +} +""", adapted_from=["HotpotQA"], ) diff --git a/mteb/tasks/Retrieval/nld/MMARCONLRetrieval.py b/mteb/tasks/Retrieval/nld/MMARCONLRetrieval.py index c8582b0cb4..951971f148 100644 --- a/mteb/tasks/Retrieval/nld/MMARCONLRetrieval.py +++ b/mteb/tasks/Retrieval/nld/MMARCONLRetrieval.py @@ -29,20 +29,22 @@ class MMMARCONL(AbsTaskRetrieval): annotations_creators="derived", # manually checked a small subset dialect=[], sample_creation="machine-translated and verified", - bibtex_citation="""@article{DBLP:journals/corr/abs-2108-13897, - author = {Luiz Bonifacio and - Israel Campiotti and - Roberto de Alencar Lotufo and - Rodrigo Frassetto Nogueira}, - title = {mMARCO: {A} Multilingual Version of {MS} {MARCO} Passage Ranking Dataset}, - journal = {CoRR}, - volume = {abs/2108.13897}, - year = {2021}, - url = {https://arxiv.org/abs/2108.13897}, - eprinttype = {arXiv}, - eprint = {2108.13897}, - timestamp = {Mon, 20 Mar 2023 15:35:34 +0100}, - biburl = {https://dblp.org/rec/journals/corr/abs-2108-13897.bib}, - bibsource = {dblp computer science bibliography, https://dblp.org} -}""", + bibtex_citation=r""" +@article{DBLP:journals/corr/abs-2108-13897, + author = {Luiz Bonifacio and +Israel Campiotti and +Roberto de Alencar Lotufo and +Rodrigo Frassetto Nogueira}, + bibsource = {dblp computer science bibliography, https://dblp.org}, + biburl = {https://dblp.org/rec/journals/corr/abs-2108-13897.bib}, + eprint = {2108.13897}, + eprinttype = {arXiv}, + journal = {CoRR}, + timestamp = {Mon, 20 Mar 2023 15:35:34 +0100}, + title = {mMARCO: {A} Multilingual Version of {MS} {MARCO} Passage Ranking Dataset}, + url = {https://arxiv.org/abs/2108.13897}, + volume = {abs/2108.13897}, + year = {2021}, +} +""", ) diff --git a/mteb/tasks/Retrieval/nld/NFCorpusNLRetrieval.py b/mteb/tasks/Retrieval/nld/NFCorpusNLRetrieval.py index bd0ab7c162..92edcdd4c2 100644 --- a/mteb/tasks/Retrieval/nld/NFCorpusNLRetrieval.py +++ 
b/mteb/tasks/Retrieval/nld/NFCorpusNLRetrieval.py @@ -28,14 +28,16 @@ class NFCorpusNL(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="machine-translated and verified", # manually checked a small subset - bibtex_citation="""@misc{banar2024beirnlzeroshotinformationretrieval, - title={BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, - author={Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, - year={2024}, - eprint={2412.08329}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2412.08329}, -}""", + bibtex_citation=r""" +@misc{banar2024beirnlzeroshotinformationretrieval, + archiveprefix = {arXiv}, + author = {Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, + eprint = {2412.08329}, + primaryclass = {cs.CL}, + title = {BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, + url = {https://arxiv.org/abs/2412.08329}, + year = {2024}, +} +""", adapted_from=["NFCorpus"], ) diff --git a/mteb/tasks/Retrieval/nld/NQNLRetrieval.py b/mteb/tasks/Retrieval/nld/NQNLRetrieval.py index f88f95061d..f6e8bf0007 100644 --- a/mteb/tasks/Retrieval/nld/NQNLRetrieval.py +++ b/mteb/tasks/Retrieval/nld/NQNLRetrieval.py @@ -27,14 +27,16 @@ class NQNL(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="machine-translated and verified", # manually checked a small subset - bibtex_citation="""@misc{banar2024beirnlzeroshotinformationretrieval, - title={BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, - author={Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, - year={2024}, - eprint={2412.08329}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2412.08329}, -}""", + bibtex_citation=r""" +@misc{banar2024beirnlzeroshotinformationretrieval, + archiveprefix = {arXiv}, + author = {Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, + eprint = {2412.08329}, + primaryclass = {cs.CL}, + title = {BEIR-NL: 
Zero-shot Information Retrieval Benchmark for the Dutch Language}, + url = {https://arxiv.org/abs/2412.08329}, + year = {2024}, +} +""", adapted_from=["NQ"], ) diff --git a/mteb/tasks/Retrieval/nld/QuoraNLRetrieval.py b/mteb/tasks/Retrieval/nld/QuoraNLRetrieval.py index 7e97fc6a88..a3452f71c9 100644 --- a/mteb/tasks/Retrieval/nld/QuoraNLRetrieval.py +++ b/mteb/tasks/Retrieval/nld/QuoraNLRetrieval.py @@ -32,14 +32,16 @@ class QuoraNLRetrieval(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="machine-translated and verified", # manually checked a small subset - bibtex_citation="""@misc{banar2024beirnlzeroshotinformationretrieval, - title={BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, - author={Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, - year={2024}, - eprint={2412.08329}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2412.08329}, -}""", + bibtex_citation=r""" +@misc{banar2024beirnlzeroshotinformationretrieval, + archiveprefix = {arXiv}, + author = {Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, + eprint = {2412.08329}, + primaryclass = {cs.CL}, + title = {BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, + url = {https://arxiv.org/abs/2412.08329}, + year = {2024}, +} +""", adapted_from=["QuoraRetrieval"], ) diff --git a/mteb/tasks/Retrieval/nld/SCIDOCSNLRetrieval.py b/mteb/tasks/Retrieval/nld/SCIDOCSNLRetrieval.py index 46fac07a12..de2c76cc8a 100644 --- a/mteb/tasks/Retrieval/nld/SCIDOCSNLRetrieval.py +++ b/mteb/tasks/Retrieval/nld/SCIDOCSNLRetrieval.py @@ -30,14 +30,16 @@ class SCIDOCSNL(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="machine-translated and verified", # manually checked a small subset - bibtex_citation="""@misc{banar2024beirnlzeroshotinformationretrieval, - title={BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, - author={Nikolay Banar and Ehsan Lotfi and 
Walter Daelemans}, - year={2024}, - eprint={2412.08329}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2412.08329}, -}""", + bibtex_citation=r""" +@misc{banar2024beirnlzeroshotinformationretrieval, + archiveprefix = {arXiv}, + author = {Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, + eprint = {2412.08329}, + primaryclass = {cs.CL}, + title = {BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, + url = {https://arxiv.org/abs/2412.08329}, + year = {2024}, +} +""", adapted_from=["SCIDOCS"], ) diff --git a/mteb/tasks/Retrieval/nld/SciFactNLRetrieval.py b/mteb/tasks/Retrieval/nld/SciFactNLRetrieval.py index a04d559d48..ae9e7f9ae2 100644 --- a/mteb/tasks/Retrieval/nld/SciFactNLRetrieval.py +++ b/mteb/tasks/Retrieval/nld/SciFactNLRetrieval.py @@ -27,14 +27,16 @@ class SciFactNL(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="machine-translated and verified", # manually checked a small subset - bibtex_citation="""@misc{banar2024beirnlzeroshotinformationretrieval, - title={BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, - author={Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, - year={2024}, - eprint={2412.08329}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2412.08329}, -}""", + bibtex_citation=r""" +@misc{banar2024beirnlzeroshotinformationretrieval, + archiveprefix = {arXiv}, + author = {Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, + eprint = {2412.08329}, + primaryclass = {cs.CL}, + title = {BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, + url = {https://arxiv.org/abs/2412.08329}, + year = {2024}, +} +""", adapted_from=["SciFact"], ) diff --git a/mteb/tasks/Retrieval/nld/TRECCOVIDNLRetrieval.py b/mteb/tasks/Retrieval/nld/TRECCOVIDNLRetrieval.py index 0efaa4f536..8b2d7699f0 100644 --- a/mteb/tasks/Retrieval/nld/TRECCOVIDNLRetrieval.py +++ 
b/mteb/tasks/Retrieval/nld/TRECCOVIDNLRetrieval.py @@ -31,14 +31,16 @@ class TRECCOVIDNL(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="machine-translated and verified", # manually checked a small subset - bibtex_citation="""@misc{banar2024beirnlzeroshotinformationretrieval, - title={BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, - author={Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, - year={2024}, - eprint={2412.08329}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2412.08329}, -}""", + bibtex_citation=r""" +@misc{banar2024beirnlzeroshotinformationretrieval, + archiveprefix = {arXiv}, + author = {Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, + eprint = {2412.08329}, + primaryclass = {cs.CL}, + title = {BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, + url = {https://arxiv.org/abs/2412.08329}, + year = {2024}, +} +""", adapted_from=["TRECCOVID"], ) diff --git a/mteb/tasks/Retrieval/nld/Touche2020NLRetrieval.py b/mteb/tasks/Retrieval/nld/Touche2020NLRetrieval.py index 61181b694b..ecc40186c4 100644 --- a/mteb/tasks/Retrieval/nld/Touche2020NLRetrieval.py +++ b/mteb/tasks/Retrieval/nld/Touche2020NLRetrieval.py @@ -26,14 +26,16 @@ class Touche2020NL(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="machine-translated and verified", # manually checked a small subset - bibtex_citation="""@misc{banar2024beirnlzeroshotinformationretrieval, - title={BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, - author={Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, - year={2024}, - eprint={2412.08329}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2412.08329}, -}""", + bibtex_citation=r""" +@misc{banar2024beirnlzeroshotinformationretrieval, + archiveprefix = {arXiv}, + author = {Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, + eprint = {2412.08329}, + 
primaryclass = {cs.CL}, + title = {BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, + url = {https://arxiv.org/abs/2412.08329}, + year = {2024}, +} +""", adapted_from=["Touche2020"], ) diff --git a/mteb/tasks/Retrieval/nob/norquad.py b/mteb/tasks/Retrieval/nob/norquad.py index f578cefec8..73a85e5ece 100644 --- a/mteb/tasks/Retrieval/nob/norquad.py +++ b/mteb/tasks/Retrieval/nob/norquad.py @@ -28,24 +28,26 @@ class NorQuadRetrieval(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{ivanova-etal-2023-norquad, - title = "{N}or{Q}u{AD}: {N}orwegian Question Answering Dataset", - author = "Ivanova, Sardana and - Andreassen, Fredrik and - Jentoft, Matias and - Wold, Sondre and - {\O}vrelid, Lilja", - editor = {Alum{\"a}e, Tanel and - Fishel, Mark}, - booktitle = "Proceedings of the 24th Nordic Conference on Computational Linguistics (NoDaLiDa)", - month = may, - year = "2023", - address = "T{\'o}rshavn, Faroe Islands", - publisher = "University of Tartu Library", - url = "https://aclanthology.org/2023.nodalida-1.17", - pages = "159--168", - abstract = "In this paper we present NorQuAD: the first Norwegian question answering dataset for machine reading comprehension. The dataset consists of 4,752 manually created question-answer pairs. We here detail the data collection procedure and present statistics of the dataset. We also benchmark several multilingual and Norwegian monolingual language models on the dataset and compare them against human performance. The dataset will be made freely available.", -}""", + bibtex_citation=r""" +@inproceedings{ivanova-etal-2023-norquad, + abstract = {In this paper we present NorQuAD: the first Norwegian question answering dataset for machine reading comprehension. The dataset consists of 4,752 manually created question-answer pairs. We here detail the data collection procedure and present statistics of the dataset. 
We also benchmark several multilingual and Norwegian monolingual language models on the dataset and compare them against human performance. The dataset will be made freely available.}, + address = {T{\'o}rshavn, Faroe Islands}, + author = {Ivanova, Sardana and +Andreassen, Fredrik and +Jentoft, Matias and +Wold, Sondre and +{\O}vrelid, Lilja}, + booktitle = {Proceedings of the 24th Nordic Conference on Computational Linguistics (NoDaLiDa)}, + editor = {Alum{\"a}e, Tanel and +Fishel, Mark}, + month = may, + pages = {159--168}, + publisher = {University of Tartu Library}, + title = {{N}or{Q}u{AD}: {N}orwegian Question Answering Dataset}, + url = {https://aclanthology.org/2023.nodalida-1.17}, + year = {2023}, +} +""", prompt={ "query": "Given a question in Norwegian, retrieve the answer from Wikipedia articles" }, diff --git a/mteb/tasks/Retrieval/nob/snl_retrieval.py b/mteb/tasks/Retrieval/nob/snl_retrieval.py index cf64834329..9d2016a7c5 100644 --- a/mteb/tasks/Retrieval/nob/snl_retrieval.py +++ b/mteb/tasks/Retrieval/nob/snl_retrieval.py @@ -27,12 +27,14 @@ class SNLRetrieval(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@mastersthesis{navjord2023beyond, - title={Beyond extractive: advancing abstractive automatic text summarization in Norwegian with transformers}, - author={Navjord, J{\o}rgen Johnsen and Korsvik, Jon-Mikkel Ryen}, - year={2023}, - school={Norwegian University of Life Sciences, {\AA}s} -}""", + bibtex_citation=r""" +@mastersthesis{navjord2023beyond, + author = {Navjord, J{\o}rgen Johnsen and Korsvik, Jon-Mikkel Ryen}, + school = {Norwegian University of Life Sciences, {\AA}s}, + title = {Beyond extractive: advancing abstractive automatic text summarization in Norwegian with transformers}, + year = {2023}, +} +""", prompt={"query": "Given a lexicon headline in Norwegian, retrieve its article"}, task_subtypes=["Article retrieval"], ) diff --git 
a/mteb/tasks/Retrieval/pol/ArguAnaPLRetrieval.py b/mteb/tasks/Retrieval/pol/ArguAnaPLRetrieval.py index 4251863c75..980dfffc9f 100644 --- a/mteb/tasks/Retrieval/pol/ArguAnaPLRetrieval.py +++ b/mteb/tasks/Retrieval/pol/ArguAnaPLRetrieval.py @@ -30,13 +30,15 @@ class ArguAnaPL(AbsTaskRetrieval): annotations_creators=None, dialect=[], sample_creation=None, - bibtex_citation="""@misc{wojtasik2024beirpl, - title={BEIR-PL: Zero Shot Information Retrieval Benchmark for the Polish Language}, - author={Konrad Wojtasik and Vadim Shishkin and Kacper Wołowiec and Arkadiusz Janz and Maciej Piasecki}, - year={2024}, - eprint={2305.19840}, - archivePrefix={arXiv}, - primaryClass={cs.IR} -}""", + bibtex_citation=r""" +@misc{wojtasik2024beirpl, + archiveprefix = {arXiv}, + author = {Konrad Wojtasik and Vadim Shishkin and Kacper Wołowiec and Arkadiusz Janz and Maciej Piasecki}, + eprint = {2305.19840}, + primaryclass = {cs.IR}, + title = {BEIR-PL: Zero Shot Information Retrieval Benchmark for the Polish Language}, + year = {2024}, +} +""", adapted_from=["ArguAna"], ) diff --git a/mteb/tasks/Retrieval/pol/DBPediaPLRetrieval.py b/mteb/tasks/Retrieval/pol/DBPediaPLRetrieval.py index 23d44fcb59..64959a5c50 100644 --- a/mteb/tasks/Retrieval/pol/DBPediaPLRetrieval.py +++ b/mteb/tasks/Retrieval/pol/DBPediaPLRetrieval.py @@ -28,16 +28,18 @@ class DBPediaPL(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="machine-translated", - bibtex_citation="""@inproceedings{Hasibi:2017:DVT, - author = {Hasibi, Faegheh and Nikolaev, Fedor and Xiong, Chenyan and Balog, Krisztian and Bratsberg, Svein Erik and Kotov, Alexander and Callan, Jamie}, - title = {DBpedia-Entity V2: A Test Collection for Entity Search}, - booktitle = {Proceedings of the 40th International ACM SIGIR Conference on Research and Development in Information Retrieval}, - series = {SIGIR '17}, - year = {2017}, - pages = {1265--1268}, - doi = {10.1145/3077136.3080751}, - publisher = {ACM} -}""", + 
bibtex_citation=r""" +@inproceedings{Hasibi:2017:DVT, + author = {Hasibi, Faegheh and Nikolaev, Fedor and Xiong, Chenyan and Balog, Krisztian and Bratsberg, Svein Erik and Kotov, Alexander and Callan, Jamie}, + booktitle = {Proceedings of the 40th International ACM SIGIR Conference on Research and Development in Information Retrieval}, + doi = {10.1145/3077136.3080751}, + pages = {1265--1268}, + publisher = {ACM}, + series = {SIGIR '17}, + title = {DBpedia-Entity V2: A Test Collection for Entity Search}, + year = {2017}, +} +""", adapted_from=["DBPedia"], ) @@ -65,15 +67,17 @@ class DBPediaPLHardNegatives(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="machine-translated", - bibtex_citation="""@inproceedings{Hasibi:2017:DVT, - author = {Hasibi, Faegheh and Nikolaev, Fedor and Xiong, Chenyan and Balog, Krisztian and Bratsberg, Svein Erik and Kotov, Alexander and Callan, Jamie}, - title = {DBpedia-Entity V2: A Test Collection for Entity Search}, - booktitle = {Proceedings of the 40th International ACM SIGIR Conference on Research and Development in Information Retrieval}, - series = {SIGIR '17}, - year = {2017}, - pages = {1265--1268}, - doi = {10.1145/3077136.3080751}, - publisher = {ACM} -}""", + bibtex_citation=r""" +@inproceedings{Hasibi:2017:DVT, + author = {Hasibi, Faegheh and Nikolaev, Fedor and Xiong, Chenyan and Balog, Krisztian and Bratsberg, Svein Erik and Kotov, Alexander and Callan, Jamie}, + booktitle = {Proceedings of the 40th International ACM SIGIR Conference on Research and Development in Information Retrieval}, + doi = {10.1145/3077136.3080751}, + pages = {1265--1268}, + publisher = {ACM}, + series = {SIGIR '17}, + title = {DBpedia-Entity V2: A Test Collection for Entity Search}, + year = {2017}, +} +""", adapted_from=["DBPedia"], ) diff --git a/mteb/tasks/Retrieval/pol/FiQAPLRetrieval.py b/mteb/tasks/Retrieval/pol/FiQAPLRetrieval.py index b54f4ae4ed..dad7ea81e1 100644 --- 
a/mteb/tasks/Retrieval/pol/FiQAPLRetrieval.py +++ b/mteb/tasks/Retrieval/pol/FiQAPLRetrieval.py @@ -30,12 +30,13 @@ class FiQAPLRetrieval(AbsTaskRetrieval): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{ -thakur2021beir, -title={{BEIR}: A Heterogeneous Benchmark for Zero-shot Evaluation of Information Retrieval Models}, -author={Nandan Thakur and Nils Reimers and Andreas R{\"u}ckl{\'e} and Abhishek Srivastava and Iryna Gurevych}, -booktitle={Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 2)}, -year={2021}, -url={https://openreview.net/forum?id=wCu6T5xFjeJ} -}""", + bibtex_citation=r""" +@inproceedings{thakur2021beir, + author = {Nandan Thakur and Nils Reimers and Andreas R{\"u}ckl{\'e} and Abhishek Srivastava and Iryna Gurevych}, + booktitle = {Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 2)}, + title = {{BEIR}: A Heterogeneous Benchmark for Zero-shot Evaluation of Information Retrieval Models}, + url = {https://openreview.net/forum?id=wCu6T5xFjeJ}, + year = {2021}, +} +""", ) diff --git a/mteb/tasks/Retrieval/pol/HotpotQAPLRetrieval.py b/mteb/tasks/Retrieval/pol/HotpotQAPLRetrieval.py index 609aa1bbf3..8f083c4e7f 100644 --- a/mteb/tasks/Retrieval/pol/HotpotQAPLRetrieval.py +++ b/mteb/tasks/Retrieval/pol/HotpotQAPLRetrieval.py @@ -28,14 +28,16 @@ class HotpotQAPL(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="machine-translated", - bibtex_citation="""@misc{wojtasik2024beirpl, - title={BEIR-PL: Zero Shot Information Retrieval Benchmark for the Polish Language}, - author={Konrad Wojtasik and Vadim Shishkin and Kacper Wołowiec and Arkadiusz Janz and Maciej Piasecki}, - year={2024}, - eprint={2305.19840}, - archivePrefix={arXiv}, - primaryClass={cs.IR} -}""", + bibtex_citation=r""" +@misc{wojtasik2024beirpl, + archiveprefix = {arXiv}, + author = {Konrad 
Wojtasik and Vadim Shishkin and Kacper Wołowiec and Arkadiusz Janz and Maciej Piasecki}, + eprint = {2305.19840}, + primaryclass = {cs.IR}, + title = {BEIR-PL: Zero Shot Information Retrieval Benchmark for the Polish Language}, + year = {2024}, +} +""", adapted_from=["HotpotQA"], ) @@ -63,13 +65,15 @@ class HotpotQAPLHardNegatives(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="machine-translated", - bibtex_citation="""@misc{wojtasik2024beirpl, - title={BEIR-PL: Zero Shot Information Retrieval Benchmark for the Polish Language}, - author={Konrad Wojtasik and Vadim Shishkin and Kacper Wołowiec and Arkadiusz Janz and Maciej Piasecki}, - year={2024}, - eprint={2305.19840}, - archivePrefix={arXiv}, - primaryClass={cs.IR} -}""", + bibtex_citation=r""" +@misc{wojtasik2024beirpl, + archiveprefix = {arXiv}, + author = {Konrad Wojtasik and Vadim Shishkin and Kacper Wołowiec and Arkadiusz Janz and Maciej Piasecki}, + eprint = {2305.19840}, + primaryclass = {cs.IR}, + title = {BEIR-PL: Zero Shot Information Retrieval Benchmark for the Polish Language}, + year = {2024}, +} +""", adapted_from=["HotpotQA"], ) diff --git a/mteb/tasks/Retrieval/pol/MSMARCOPLRetrieval.py b/mteb/tasks/Retrieval/pol/MSMARCOPLRetrieval.py index 9d68ad507a..d53cb7cbc9 100644 --- a/mteb/tasks/Retrieval/pol/MSMARCOPLRetrieval.py +++ b/mteb/tasks/Retrieval/pol/MSMARCOPLRetrieval.py @@ -30,14 +30,16 @@ class MSMARCOPL(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="machine-translated", - bibtex_citation=""""@misc{wojtasik2024beirpl, - title={BEIR-PL: Zero Shot Information Retrieval Benchmark for the Polish Language}, - author={Konrad Wojtasik and Vadim Shishkin and Kacper Wołowiec and Arkadiusz Janz and Maciej Piasecki}, - year={2024}, - eprint={2305.19840}, - archivePrefix={arXiv}, - primaryClass={cs.IR} -}""", + bibtex_citation=r""" +@misc{wojtasik2024beirpl, + archiveprefix = {arXiv}, + author = {Konrad Wojtasik and Vadim Shishkin and 
Kacper Wołowiec and Arkadiusz Janz and Maciej Piasecki}, + eprint = {2305.19840}, + primaryclass = {cs.IR}, + title = {BEIR-PL: Zero Shot Information Retrieval Benchmark for the Polish Language}, + year = {2024}, +} +""", adapted_from=["MSMARCO"], ) @@ -67,13 +69,15 @@ class MSMARCOPLHardNegatives(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="machine-translated", - bibtex_citation=""""@misc{wojtasik2024beirpl, - title={BEIR-PL: Zero Shot Information Retrieval Benchmark for the Polish Language}, - author={Konrad Wojtasik and Vadim Shishkin and Kacper Wołowiec and Arkadiusz Janz and Maciej Piasecki}, - year={2024}, - eprint={2305.19840}, - archivePrefix={arXiv}, - primaryClass={cs.IR} -}""", + bibtex_citation=r""" +@misc{wojtasik2024beirpl, + archiveprefix = {arXiv}, + author = {Konrad Wojtasik and Vadim Shishkin and Kacper Wołowiec and Arkadiusz Janz and Maciej Piasecki}, + eprint = {2305.19840}, + primaryclass = {cs.IR}, + title = {BEIR-PL: Zero Shot Information Retrieval Benchmark for the Polish Language}, + year = {2024}, +} +""", adapted_from=["MSMARCO"], ) diff --git a/mteb/tasks/Retrieval/pol/NFCorpusPLRetrieval.py b/mteb/tasks/Retrieval/pol/NFCorpusPLRetrieval.py index ad6e41dc07..2c8558c08c 100644 --- a/mteb/tasks/Retrieval/pol/NFCorpusPLRetrieval.py +++ b/mteb/tasks/Retrieval/pol/NFCorpusPLRetrieval.py @@ -28,13 +28,15 @@ class NFCorpusPL(AbsTaskRetrieval): annotations_creators=None, dialect=None, sample_creation=None, - bibtex_citation="""@misc{wojtasik2024beirpl, - title={BEIR-PL: Zero Shot Information Retrieval Benchmark for the Polish Language}, - author={Konrad Wojtasik and Vadim Shishkin and Kacper Wołowiec and Arkadiusz Janz and Maciej Piasecki}, - year={2024}, - eprint={2305.19840}, - archivePrefix={arXiv}, - primaryClass={cs.IR} -}""", + bibtex_citation=r""" +@misc{wojtasik2024beirpl, + archiveprefix = {arXiv}, + author = {Konrad Wojtasik and Vadim Shishkin and Kacper Wołowiec and Arkadiusz Janz and Maciej 
Piasecki}, + eprint = {2305.19840}, + primaryclass = {cs.IR}, + title = {BEIR-PL: Zero Shot Information Retrieval Benchmark for the Polish Language}, + year = {2024}, +} +""", adapted_from=["NFCorpus"], ) diff --git a/mteb/tasks/Retrieval/pol/NQPLRetrieval.py b/mteb/tasks/Retrieval/pol/NQPLRetrieval.py index bbe96a3ce4..a3cab40691 100644 --- a/mteb/tasks/Retrieval/pol/NQPLRetrieval.py +++ b/mteb/tasks/Retrieval/pol/NQPLRetrieval.py @@ -28,14 +28,16 @@ class NQPL(AbsTaskRetrieval): annotations_creators=None, dialect=[], sample_creation="machine-translated", - bibtex_citation="""@misc{wojtasik2024beirpl, - title={BEIR-PL: Zero Shot Information Retrieval Benchmark for the Polish Language}, - author={Konrad Wojtasik and Vadim Shishkin and Kacper Wołowiec and Arkadiusz Janz and Maciej Piasecki}, - year={2024}, - eprint={2305.19840}, - archivePrefix={arXiv}, - primaryClass={cs.IR} -}""", + bibtex_citation=r""" +@misc{wojtasik2024beirpl, + archiveprefix = {arXiv}, + author = {Konrad Wojtasik and Vadim Shishkin and Kacper Wołowiec and Arkadiusz Janz and Maciej Piasecki}, + eprint = {2305.19840}, + primaryclass = {cs.IR}, + title = {BEIR-PL: Zero Shot Information Retrieval Benchmark for the Polish Language}, + year = {2024}, +} +""", adapted_from=["NQ"], ) @@ -63,13 +65,15 @@ class NQPLHardNegatives(AbsTaskRetrieval): annotations_creators=None, dialect=[], sample_creation="machine-translated", - bibtex_citation="""@misc{wojtasik2024beirpl, - title={BEIR-PL: Zero Shot Information Retrieval Benchmark for the Polish Language}, - author={Konrad Wojtasik and Vadim Shishkin and Kacper Wołowiec and Arkadiusz Janz and Maciej Piasecki}, - year={2024}, - eprint={2305.19840}, - archivePrefix={arXiv}, - primaryClass={cs.IR} -}""", + bibtex_citation=r""" +@misc{wojtasik2024beirpl, + archiveprefix = {arXiv}, + author = {Konrad Wojtasik and Vadim Shishkin and Kacper Wołowiec and Arkadiusz Janz and Maciej Piasecki}, + eprint = {2305.19840}, + primaryclass = {cs.IR}, + title = {BEIR-PL: 
Zero Shot Information Retrieval Benchmark for the Polish Language}, + year = {2024}, +} +""", adapted_from=["NQ"], ) diff --git a/mteb/tasks/Retrieval/pol/QuoraPLRetrieval.py b/mteb/tasks/Retrieval/pol/QuoraPLRetrieval.py index 12586ba697..7ab63fe481 100644 --- a/mteb/tasks/Retrieval/pol/QuoraPLRetrieval.py +++ b/mteb/tasks/Retrieval/pol/QuoraPLRetrieval.py @@ -28,14 +28,16 @@ class QuoraPLRetrieval(AbsTaskRetrieval): annotations_creators=None, dialect=[], sample_creation="machine-translated", - bibtex_citation=""""@misc{wojtasik2024beirpl, - title={BEIR-PL: Zero Shot Information Retrieval Benchmark for the Polish Language}, - author={Konrad Wojtasik and Vadim Shishkin and Kacper Wołowiec and Arkadiusz Janz and Maciej Piasecki}, - year={2024}, - eprint={2305.19840}, - archivePrefix={arXiv}, - primaryClass={cs.IR} -}""", + bibtex_citation=r""" +@misc{wojtasik2024beirpl, + archiveprefix = {arXiv}, + author = {Konrad Wojtasik and Vadim Shishkin and Kacper Wołowiec and Arkadiusz Janz and Maciej Piasecki}, + eprint = {2305.19840}, + primaryclass = {cs.IR}, + title = {BEIR-PL: Zero Shot Information Retrieval Benchmark for the Polish Language}, + year = {2024}, +} +""", adapted_from=["QuoraRetrieval"], ) @@ -63,13 +65,15 @@ class QuoraPLRetrievalHardNegatives(AbsTaskRetrieval): annotations_creators=None, dialect=[], sample_creation="machine-translated", - bibtex_citation=""""@misc{wojtasik2024beirpl, - title={BEIR-PL: Zero Shot Information Retrieval Benchmark for the Polish Language}, - author={Konrad Wojtasik and Vadim Shishkin and Kacper Wołowiec and Arkadiusz Janz and Maciej Piasecki}, - year={2024}, - eprint={2305.19840}, - archivePrefix={arXiv}, - primaryClass={cs.IR} -}""", + bibtex_citation=r""" +@misc{wojtasik2024beirpl, + archiveprefix = {arXiv}, + author = {Konrad Wojtasik and Vadim Shishkin and Kacper Wołowiec and Arkadiusz Janz and Maciej Piasecki}, + eprint = {2305.19840}, + primaryclass = {cs.IR}, + title = {BEIR-PL: Zero Shot Information Retrieval Benchmark 
for the Polish Language}, + year = {2024}, +} +""", adapted_from=["QuoraRetrieval"], ) diff --git a/mteb/tasks/Retrieval/pol/SCIDOCSPLRetrieval.py b/mteb/tasks/Retrieval/pol/SCIDOCSPLRetrieval.py index c7cd4958f8..5ebdf9eb0f 100644 --- a/mteb/tasks/Retrieval/pol/SCIDOCSPLRetrieval.py +++ b/mteb/tasks/Retrieval/pol/SCIDOCSPLRetrieval.py @@ -28,13 +28,15 @@ class SCIDOCSPL(AbsTaskRetrieval): annotations_creators=None, dialect=None, sample_creation=None, - bibtex_citation="""@misc{wojtasik2024beirpl, - title={BEIR-PL: Zero Shot Information Retrieval Benchmark for the Polish Language}, - author={Konrad Wojtasik and Vadim Shishkin and Kacper Wołowiec and Arkadiusz Janz and Maciej Piasecki}, - year={2024}, - eprint={2305.19840}, - archivePrefix={arXiv}, - primaryClass={cs.IR} -}""", + bibtex_citation=r""" +@misc{wojtasik2024beirpl, + archiveprefix = {arXiv}, + author = {Konrad Wojtasik and Vadim Shishkin and Kacper Wołowiec and Arkadiusz Janz and Maciej Piasecki}, + eprint = {2305.19840}, + primaryclass = {cs.IR}, + title = {BEIR-PL: Zero Shot Information Retrieval Benchmark for the Polish Language}, + year = {2024}, +} +""", adapted_from=["SCIDOCS"], ) diff --git a/mteb/tasks/Retrieval/pol/SciFactPLRetrieval.py b/mteb/tasks/Retrieval/pol/SciFactPLRetrieval.py index 1199df50a9..d73b55eb2f 100644 --- a/mteb/tasks/Retrieval/pol/SciFactPLRetrieval.py +++ b/mteb/tasks/Retrieval/pol/SciFactPLRetrieval.py @@ -28,13 +28,15 @@ class SciFactPL(AbsTaskRetrieval): annotations_creators=None, dialect=None, sample_creation=None, - bibtex_citation="""@misc{wojtasik2024beirpl, - title={BEIR-PL: Zero Shot Information Retrieval Benchmark for the Polish Language}, - author={Konrad Wojtasik and Vadim Shishkin and Kacper Wołowiec and Arkadiusz Janz and Maciej Piasecki}, - year={2024}, - eprint={2305.19840}, - archivePrefix={arXiv}, - primaryClass={cs.IR} -}""", + bibtex_citation=r""" +@misc{wojtasik2024beirpl, + archiveprefix = {arXiv}, + author = {Konrad Wojtasik and Vadim Shishkin and 
Kacper Wołowiec and Arkadiusz Janz and Maciej Piasecki}, + eprint = {2305.19840}, + primaryclass = {cs.IR}, + title = {BEIR-PL: Zero Shot Information Retrieval Benchmark for the Polish Language}, + year = {2024}, +} +""", adapted_from=["SciFact"], ) diff --git a/mteb/tasks/Retrieval/pol/TRECCOVIDPLRetrieval.py b/mteb/tasks/Retrieval/pol/TRECCOVIDPLRetrieval.py index f3bfd2ad13..bb9da781b0 100644 --- a/mteb/tasks/Retrieval/pol/TRECCOVIDPLRetrieval.py +++ b/mteb/tasks/Retrieval/pol/TRECCOVIDPLRetrieval.py @@ -31,13 +31,15 @@ class TRECCOVIDPL(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="machine-translated", - bibtex_citation="""@misc{wojtasik2024beirpl, - title={BEIR-PL: Zero Shot Information Retrieval Benchmark for the Polish Language}, - author={Konrad Wojtasik and Vadim Shishkin and Kacper Wołowiec and Arkadiusz Janz and Maciej Piasecki}, - year={2024}, - eprint={2305.19840}, - archivePrefix={arXiv}, - primaryClass={cs.IR} -}""", + bibtex_citation=r""" +@misc{wojtasik2024beirpl, + archiveprefix = {arXiv}, + author = {Konrad Wojtasik and Vadim Shishkin and Kacper Wołowiec and Arkadiusz Janz and Maciej Piasecki}, + eprint = {2305.19840}, + primaryclass = {cs.IR}, + title = {BEIR-PL: Zero Shot Information Retrieval Benchmark for the Polish Language}, + year = {2024}, +} +""", adapted_from=["TRECCOVID"], ) diff --git a/mteb/tasks/Retrieval/rus/RiaNewsRetrieval.py b/mteb/tasks/Retrieval/rus/RiaNewsRetrieval.py index 049a3c5edf..0399374200 100644 --- a/mteb/tasks/Retrieval/rus/RiaNewsRetrieval.py +++ b/mteb/tasks/Retrieval/rus/RiaNewsRetrieval.py @@ -29,12 +29,14 @@ class RiaNewsRetrieval(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{gavrilov2018self, - title={Self-Attentive Model for Headline Generation}, - author={Gavrilov, Daniil and Kalaidin, Pavel and Malykh, Valentin}, - booktitle={Proceedings of the 41st European Conference on Information Retrieval}, - 
year={2019} - }""", + bibtex_citation=r""" +@inproceedings{gavrilov2018self, + author = {Gavrilov, Daniil and Kalaidin, Pavel and Malykh, Valentin}, + booktitle = {Proceedings of the 41st European Conference on Information Retrieval}, + title = {Self-Attentive Model for Headline Generation}, + year = {2019}, +} +""", prompt={"query": "Given a news title, retrieve relevant news article"}, ) @@ -63,11 +65,13 @@ class RiaNewsRetrievalHardNegatives(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{gavrilov2018self, - title={Self-Attentive Model for Headline Generation}, - author={Gavrilov, Daniil and Kalaidin, Pavel and Malykh, Valentin}, - booktitle={Proceedings of the 41st European Conference on Information Retrieval}, - year={2019} - }""", + bibtex_citation=r""" +@inproceedings{gavrilov2018self, + author = {Gavrilov, Daniil and Kalaidin, Pavel and Malykh, Valentin}, + booktitle = {Proceedings of the 41st European Conference on Information Retrieval}, + title = {Self-Attentive Model for Headline Generation}, + year = {2019}, +} +""", adapted_from=["RiaNewsRetrieval"], ) diff --git a/mteb/tasks/Retrieval/rus/RuBQRetrieval.py b/mteb/tasks/Retrieval/rus/RuBQRetrieval.py index 3bb1bb35e9..1c294682c6 100644 --- a/mteb/tasks/Retrieval/rus/RuBQRetrieval.py +++ b/mteb/tasks/Retrieval/rus/RuBQRetrieval.py @@ -29,13 +29,15 @@ class RuBQRetrieval(AbsTaskRetrieval): annotations_creators="human-annotated", dialect=[], sample_creation="created", - bibtex_citation="""@inproceedings{RuBQ2021, - title={RuBQ 2.0: An Innovated Russian Question Answering Dataset}, - author={Ivan Rybin and Vladislav Korablinov and Pavel Efimov and Pavel Braslavski}, - booktitle={ESWC}, - year={2021}, - pages={532--547} - }""", + bibtex_citation=r""" +@inproceedings{RuBQ2021, + author = {Ivan Rybin and Vladislav Korablinov and Pavel Efimov and Pavel Braslavski}, + booktitle = {ESWC}, + pages = {532--547}, + title = {RuBQ 2.0: An 
Innovated Russian Question Answering Dataset}, + year = {2021}, +} +""", prompt={ "query": "Given a question, retrieve Wikipedia passages that answer the question" }, diff --git a/mteb/tasks/Retrieval/slk/SlovakSumRetrieval.py b/mteb/tasks/Retrieval/slk/SlovakSumRetrieval.py index ea08c03225..fbba98487e 100644 --- a/mteb/tasks/Retrieval/slk/SlovakSumRetrieval.py +++ b/mteb/tasks/Retrieval/slk/SlovakSumRetrieval.py @@ -34,14 +34,14 @@ class SlovakSumRetrieval(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation=""" - @inproceedings{OndrejowaSlovakSum24, - title = {SlovakSum: A Large Scale Slovak Summarization Dataset}, - booktitle = {Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation}, - author = {Ondrejová, Viktória and Šuppa, Marek}, - date = {2024}, - } - """, + bibtex_citation=r""" +@inproceedings{OndrejowaSlovakSum24, + author = {Ondrejová, Viktória and Šuppa, Marek}, + booktitle = {Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation}, + date = {2024}, + title = {SlovakSum: A Large Scale Slovak Summarization Dataset}, +} +""", ) def load_data(self, **kwargs): diff --git a/mteb/tasks/Retrieval/spa/SpanishPassageRetrievalS2P.py b/mteb/tasks/Retrieval/spa/SpanishPassageRetrievalS2P.py index 8ef0681dcd..79e0f59d01 100644 --- a/mteb/tasks/Retrieval/spa/SpanishPassageRetrievalS2P.py +++ b/mteb/tasks/Retrieval/spa/SpanishPassageRetrievalS2P.py @@ -30,27 +30,28 @@ class SpanishPassageRetrievalS2P(AbsTaskRetrieval): annotations_creators=None, dialect=None, sample_creation=None, - bibtex_citation="""@InProceedings{10.1007/978-3-030-15719-7_19, -author="Kamateri, Eleni + bibtex_citation=r""" +@inproceedings{10.1007/978-3-030-15719-7_19, + abstract = {This paper describes a new test collection for passage retrieval from health-related Web resources in Spanish. 
The test collection contains 10,037 health-related documents in Spanish, 37 topics representing complex information needs formulated in a total of 167 natural language questions, and manual relevance assessments of text passages, pooled from multiple systems. This test collection is the first to combine search in a language beyond English, passage retrieval, and health-related resources and topics targeting the general public.}, + address = {Cham}, + author = {Kamateri, Eleni and Tsikrika, Theodora and Symeonidis, Spyridon and Vrochidis, Stefanos and Minker, Wolfgang -and Kompatsiaris, Yiannis", -editor="Azzopardi, Leif +and Kompatsiaris, Yiannis}, + booktitle = {Advances in Information Retrieval}, + editor = {Azzopardi, Leif and Stein, Benno and Fuhr, Norbert and Mayr, Philipp and Hauff, Claudia -and Hiemstra, Djoerd", -title="A Test Collection for Passage Retrieval Evaluation of Spanish Health-Related Resources", -booktitle="Advances in Information Retrieval", -year="2019", -publisher="Springer International Publishing", -address="Cham", -pages="148--154", -abstract="This paper describes a new test collection for passage retrieval from health-related Web resources in Spanish. The test collection contains 10,037 health-related documents in Spanish, 37 topics representing complex information needs formulated in a total of 167 natural language questions, and manual relevance assessments of text passages, pooled from multiple systems. 
This test collection is the first to combine search in a language beyond English, passage retrieval, and health-related resources and topics targeting the general public.", -isbn="978-3-030-15719-7" +and Hiemstra, Djoerd}, + isbn = {978-3-030-15719-7}, + pages = {148--154}, + publisher = {Springer International Publishing}, + title = {A Test Collection for Passage Retrieval Evaluation of Spanish Health-Related Resources}, + year = {2019}, } """, ) diff --git a/mteb/tasks/Retrieval/spa/SpanishPassageRetrievalS2S.py b/mteb/tasks/Retrieval/spa/SpanishPassageRetrievalS2S.py index 86b45f1f4c..f22739e2a1 100644 --- a/mteb/tasks/Retrieval/spa/SpanishPassageRetrievalS2S.py +++ b/mteb/tasks/Retrieval/spa/SpanishPassageRetrievalS2S.py @@ -30,27 +30,28 @@ class SpanishPassageRetrievalS2S(AbsTaskRetrieval): annotations_creators=None, dialect=None, sample_creation=None, - bibtex_citation="""@InProceedings{10.1007/978-3-030-15719-7_19, -author="Kamateri, Eleni + bibtex_citation=r""" +@inproceedings{10.1007/978-3-030-15719-7_19, + abstract = {This paper describes a new test collection for passage retrieval from health-related Web resources in Spanish. The test collection contains 10,037 health-related documents in Spanish, 37 topics representing complex information needs formulated in a total of 167 natural language questions, and manual relevance assessments of text passages, pooled from multiple systems. 
This test collection is the first to combine search in a language beyond English, passage retrieval, and health-related resources and topics targeting the general public.}, + address = {Cham}, + author = {Kamateri, Eleni and Tsikrika, Theodora and Symeonidis, Spyridon and Vrochidis, Stefanos and Minker, Wolfgang -and Kompatsiaris, Yiannis", -editor="Azzopardi, Leif +and Kompatsiaris, Yiannis}, + booktitle = {Advances in Information Retrieval}, + editor = {Azzopardi, Leif and Stein, Benno and Fuhr, Norbert and Mayr, Philipp and Hauff, Claudia -and Hiemstra, Djoerd", -title="A Test Collection for Passage Retrieval Evaluation of Spanish Health-Related Resources", -booktitle="Advances in Information Retrieval", -year="2019", -publisher="Springer International Publishing", -address="Cham", -pages="148--154", -abstract="This paper describes a new test collection for passage retrieval from health-related Web resources in Spanish. The test collection contains 10,037 health-related documents in Spanish, 37 topics representing complex information needs formulated in a total of 167 natural language questions, and manual relevance assessments of text passages, pooled from multiple systems. 
This test collection is the first to combine search in a language beyond English, passage retrieval, and health-related resources and topics targeting the general public.", -isbn="978-3-030-15719-7" +and Hiemstra, Djoerd}, + isbn = {978-3-030-15719-7}, + pages = {148--154}, + publisher = {Springer International Publishing}, + title = {A Test Collection for Passage Retrieval Evaluation of Spanish Health-Related Resources}, + year = {2019}, } """, ) diff --git a/mteb/tasks/Retrieval/swe/SweFaqRetrieval.py b/mteb/tasks/Retrieval/swe/SweFaqRetrieval.py index eccc7d9ab7..14637b5eb5 100644 --- a/mteb/tasks/Retrieval/swe/SweFaqRetrieval.py +++ b/mteb/tasks/Retrieval/swe/SweFaqRetrieval.py @@ -30,13 +30,15 @@ class SweFaqRetrieval(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{berdivcevskis2023superlim, - title={Superlim: A Swedish language understanding evaluation benchmark}, - author={Berdi{\v{c}}evskis, Aleksandrs and Bouma, Gerlof and Kurtz, Robin and Morger, Felix and {\"O}hman, Joey and Adesam, Yvonne and Borin, Lars and Dann{\'e}lls, Dana and Forsberg, Markus and Isbister, Tim and others}, - booktitle={Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing}, - pages={8137--8153}, - year={2023} -}""", # for the benchmark in which this dataset is used + bibtex_citation=r""" +@inproceedings{berdivcevskis2023superlim, + author = {Berdi{\v{c}}evskis, Aleksandrs and Bouma, Gerlof and Kurtz, Robin and Morger, Felix and {\"O}hman, Joey and Adesam, Yvonne and Borin, Lars and Dann{\'e}lls, Dana and Forsberg, Markus and Isbister, Tim and others}, + booktitle = {Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing}, + pages = {8137--8153}, + title = {Superlim: A Swedish language understanding evaluation benchmark}, + year = {2023}, +} +""", # for the benchmark in which this dataset is used prompt={"query": "Retrieve answers given 
questions in Swedish"}, ) diff --git a/mteb/tasks/Retrieval/swe/SwednRetrieval.py b/mteb/tasks/Retrieval/swe/SwednRetrieval.py index acd7b65de7..4867813af0 100644 --- a/mteb/tasks/Retrieval/swe/SwednRetrieval.py +++ b/mteb/tasks/Retrieval/swe/SwednRetrieval.py @@ -30,12 +30,14 @@ class SwednRetrieval(AbsTaskRetrieval): dialect=[], task_subtypes=["Article retrieval"], sample_creation="found", - bibtex_citation="""@inproceedings{monsen2021method, - title={A method for building non-english corpora for abstractive text summarization}, - author={Monsen, Julius and J{\"o}nsson, Arne}, - booktitle={Proceedings of CLARIN Annual Conference}, - year={2021} -}""", + bibtex_citation=r""" +@inproceedings{monsen2021method, + author = {Monsen, Julius and J{\"o}nsson, Arne}, + booktitle = {Proceedings of CLARIN Annual Conference}, + title = {A method for building non-english corpora for abstractive text summarization}, + year = {2021}, +} +""", prompt={ "query": "Given a Swedish news headline retrieve summaries or news articles" }, diff --git a/mteb/tasks/Retrieval/tur/TurHistQuad.py b/mteb/tasks/Retrieval/tur/TurHistQuad.py index cac7b0d8fb..a8bc0912bb 100644 --- a/mteb/tasks/Retrieval/tur/TurHistQuad.py +++ b/mteb/tasks/Retrieval/tur/TurHistQuad.py @@ -28,19 +28,19 @@ class TurHistQuadRetrieval(AbsTaskRetrieval): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation=""" - @INPROCEEDINGS{9559013, - author={Soygazi, Fatih and Çiftçi, Okan and Kök, Uğurcan and Cengiz, Soner}, - booktitle={2021 6th International Conference on Computer Science and Engineering (UBMK)}, - title={THQuAD: Turkish Historic Question Answering Dataset for Reading Comprehension}, - year={2021}, - volume={}, - number={}, - pages={215-220}, - keywords={Computer science;Computational modeling;Neural networks;Knowledge discovery;Information retrieval;Natural language processing;History;question answering;information retrieval;natural language understanding;deep 
learning;contextualized word embeddings}, - doi={10.1109/UBMK52708.2021.9559013}} - - """, + bibtex_citation=r""" +@inproceedings{9559013, + author = {Soygazi, Fatih and Çiftçi, Okan and Kök, Uğurcan and Cengiz, Soner}, + booktitle = {2021 6th International Conference on Computer Science and Engineering (UBMK)}, + doi = {10.1109/UBMK52708.2021.9559013}, + keywords = {Computer science;Computational modeling;Neural networks;Knowledge discovery;Information retrieval;Natural language processing;History;question answering;information retrieval;natural language understanding;deep learning;contextualized word embeddings}, + number = {}, + pages = {215-220}, + title = {THQuAD: Turkish Historic Question Answering Dataset for Reading Comprehension}, + volume = {}, + year = {2021}, +} +""", ) def load_data(self, **kwargs) -> None: diff --git a/mteb/tasks/Retrieval/vie/GreenNodeTableMarkdownRetrieval.py b/mteb/tasks/Retrieval/vie/GreenNodeTableMarkdownRetrieval.py new file mode 100644 index 0000000000..75a7928621 --- /dev/null +++ b/mteb/tasks/Retrieval/vie/GreenNodeTableMarkdownRetrieval.py @@ -0,0 +1,33 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval + +TEST_SAMPLES = 2048 + + +class GreenNodeTableMarkdownRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="GreenNodeTableMarkdownRetrieval", + description="GreenNodeTable documents", + reference="https://huggingface.co/GreenNode", + dataset={ + "path": "GreenNode/GreenNode-Table-Markdown-Retrieval-VN", + "revision": "d86a4dad9fd7c70359f617d86984395ea89be1c5", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="ndcg_at_10", + date=("2025-03-16", "2025-03-16"), + domains=["Financial", "Encyclopaedic", "Non-fiction"], + task_subtypes=["Article retrieval"], + license="mit", + annotations_creators="human-annotated", + dialect=[], + 
sample_creation="found", + bibtex_citation="", # TODO: Add bibtex citation when the paper is published + ) diff --git a/mteb/tasks/Retrieval/vie/VieQuADRetrieval.py b/mteb/tasks/Retrieval/vie/VieQuADRetrieval.py index 07ec5aba8b..8391fb0adb 100644 --- a/mteb/tasks/Retrieval/vie/VieQuADRetrieval.py +++ b/mteb/tasks/Retrieval/vie/VieQuADRetrieval.py @@ -33,23 +33,26 @@ class VieQuADRetrieval(AbsTaskRetrieval): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{nguyen-etal-2020-vietnamese, -title = "A Vietnamese Dataset for Evaluating Machine Reading Comprehension", -author = "Nguyen, Kiet and - Nguyen, Vu and - Nguyen, Anh and - Nguyen, Ngan", -editor = "Scott, Donia and - Bel, Nuria and - Zong, Chengqing", -booktitle = "Proceedings of the 28th International Conference on Computational Linguistics", -month = dec, -year = "2020", -address = "Barcelona, Spain (Online)", -publisher = "International Committee on Computational Linguistics", -url = "https://aclanthology.org/2020.coling-main.233", -doi = "10.18653/v1/2020.coling-main.233", -pages = "2595--2605"}""", + bibtex_citation=r""" +@inproceedings{nguyen-etal-2020-vietnamese, + address = {Barcelona, Spain (Online)}, + author = {Nguyen, Kiet and +Nguyen, Vu and +Nguyen, Anh and +Nguyen, Ngan}, + booktitle = {Proceedings of the 28th International Conference on Computational Linguistics}, + doi = {10.18653/v1/2020.coling-main.233}, + editor = {Scott, Donia and +Bel, Nuria and +Zong, Chengqing}, + month = dec, + pages = {2595--2605}, + publisher = {International Committee on Computational Linguistics}, + title = {A Vietnamese Dataset for Evaluating Machine Reading Comprehension}, + url = {https://aclanthology.org/2020.coling-main.233}, + year = {2020}, +} +""", ) def load_data(self, **kwargs): diff --git a/mteb/tasks/Retrieval/vie/ZacLegalTextRetrieval.py b/mteb/tasks/Retrieval/vie/ZacLegalTextRetrieval.py new file mode 100644 index 0000000000..3a97e80afe --- 
/dev/null +++ b/mteb/tasks/Retrieval/vie/ZacLegalTextRetrieval.py @@ -0,0 +1,31 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval + + +class ZacLegalTextRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="ZacLegalTextRetrieval", + description="Zalo Legal Text documents", + reference="https://challenge.zalo.ai", + dataset={ + "path": "GreenNode/zalo-ai-legal-text-retrieval-vn", + "revision": "910766554633e8da014e88f54988705dde7ecaac", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="ndcg_at_10", + date=("2025-03-16", "2025-03-16"), + domains=["Legal"], + task_subtypes=["Article retrieval"], + license="mit", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", + bibtex_citation="", # TODO: Add bibtex citation when the paper is published + ) diff --git a/mteb/tasks/Retrieval/zho/CMTEBRetrieval.py b/mteb/tasks/Retrieval/zho/CMTEBRetrieval.py index eec977f926..dd158e2858 100644 --- a/mteb/tasks/Retrieval/zho/CMTEBRetrieval.py +++ b/mteb/tasks/Retrieval/zho/CMTEBRetrieval.py @@ -44,21 +44,29 @@ class T2Retrieval(AbsTaskRetrieval): eval_splits=["dev"], eval_langs=["cmn-Hans"], main_score="ndcg_at_10", - date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, + date=("2023-04-04", "2023-05-16"), + domains=[ + "Medical", + "Academic", + "Financial", + "Government", + "Non-fiction", + ], + task_subtypes=[], + license="apache-2.0", + annotations_creators="human-annotated", dialect=None, sample_creation=None, - bibtex_citation="""@misc{xie2023t2ranking, - title={T2Ranking: A large-scale Chinese Benchmark for Passage Ranking}, - author={Xiaohui Xie and Qian Dong and Bingning Wang and Feiyang Lv and Ting Yao and Weinan Gan and Zhijing Wu and Xiangsheng Li and Haitao Li and Yiqun Liu and Jin Ma}, - year={2023}, - 
eprint={2304.03679}, - archivePrefix={arXiv}, - primaryClass={cs.IR} -}""", + bibtex_citation=r""" +@misc{xie2023t2ranking, + archiveprefix = {arXiv}, + author = {Xiaohui Xie and Qian Dong and Bingning Wang and Feiyang Lv and Ting Yao and Weinan Gan and Zhijing Wu and Xiangsheng Li and Haitao Li and Yiqun Liu and Jin Ma}, + eprint = {2304.03679}, + primaryclass = {cs.IR}, + title = {T2Ranking: A large-scale Chinese Benchmark for Passage Ranking}, + year = {2023}, +} +""", prompt={ "query": "Given a Chinese search query, retrieve web passages that answer the question" }, @@ -102,14 +110,16 @@ class MMarcoRetrieval(AbsTaskRetrieval): annotations_creators=None, dialect=None, sample_creation=None, - bibtex_citation="""@misc{xiao2024cpack, - title={C-Pack: Packaged Resources To Advance General Chinese Embedding}, - author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie}, - year={2024}, - eprint={2309.07597}, - archivePrefix={arXiv}, - primaryClass={cs.CL} -}""", + bibtex_citation=r""" +@misc{xiao2024cpack, + archiveprefix = {arXiv}, + author = {Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie}, + eprint = {2309.07597}, + primaryclass = {cs.CL}, + title = {C-Pack: Packaged Resources To Advance General Chinese Embedding}, + year = {2024}, +} +""", prompt={ "query": "Given a web search query, retrieve relevant passages that answer the query" }, @@ -151,14 +161,16 @@ class DuRetrieval(AbsTaskRetrieval): annotations_creators=None, dialect=None, sample_creation=None, - bibtex_citation="""@misc{qiu2022dureaderretrieval, - title={DuReader_retrieval: A Large-scale Chinese Benchmark for Passage Retrieval from Web Search Engine}, - author={Yifu Qiu and Hongyu Li and Yingqi Qu and Ying Chen and Qiaoqiao She and Jing Liu and Hua Wu and Haifeng Wang}, - year={2022}, - eprint={2203.10232}, - archivePrefix={arXiv}, - primaryClass={cs.CL} -}""", + bibtex_citation=r""" 
+@misc{qiu2022dureaderretrieval, + archiveprefix = {arXiv}, + author = {Yifu Qiu and Hongyu Li and Yingqi Qu and Ying Chen and Qiaoqiao She and Jing Liu and Hua Wu and Haifeng Wang}, + eprint = {2203.10232}, + primaryclass = {cs.CL}, + title = {DuReader_retrieval: A Large-scale Chinese Benchmark for Passage Retrieval from Web Search Engine}, + year = {2022}, +} +""", prompt={ "query": "Given a Chinese search query, retrieve web passages that answer the question" }, @@ -193,14 +205,24 @@ class CovidRetrieval(AbsTaskRetrieval): eval_splits=["dev"], eval_langs=["cmn-Hans"], main_score="ndcg_at_10", - date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, + date=("2022-03-03", "2022-03-18"), + domains=["Medical", "Entertainment"], + task_subtypes=[], + license="not specified", + annotations_creators="human-annotated", + dialect=[], sample_creation=None, - bibtex_citation=None, + bibtex_citation=r""" +@misc{long2022multicprmultidomainchinese, + archiveprefix = {arXiv}, + author = {Dingkun Long and Qiong Gao and Kuan Zou and Guangwei Xu and Pengjun Xie and Ruijie Guo and Jian Xu and Guanjun Jiang and Luxi Xing and Ping Yang}, + eprint = {2203.03367}, + primaryclass = {cs.IR}, + title = {Multi-CPR: A Multi Domain Chinese Dataset for Passage Retrieval}, + url = {https://arxiv.org/abs/2203.03367}, + year = {2022}, +} +""", prompt={ "query": "Given a question on COVID-19, retrieve news articles that answer the question" }, @@ -242,7 +264,17 @@ class CmedqaRetrieval(AbsTaskRetrieval): annotations_creators=None, dialect=None, sample_creation=None, - bibtex_citation=None, + bibtex_citation=r""" +@misc{qiu2022dureaderretrievallargescalechinesebenchmark, + archiveprefix = {arXiv}, + author = {Yifu Qiu and Hongyu Li and Yingqi Qu and Ying Chen and Qiaoqiao She and Jing Liu and Hua Wu and Haifeng Wang}, + eprint = {2203.10232}, + primaryclass = {cs.CL}, + title = {DuReader_retrieval: A Large-scale Chinese Benchmark for Passage 
Retrieval from Web Search Engine}, + url = {https://arxiv.org/abs/2203.10232}, + year = {2022}, +} +""", prompt={ "query": "Given a Chinese community medical question, retrieve replies that best answer the question" }, @@ -286,7 +318,17 @@ class EcomRetrieval(AbsTaskRetrieval): annotations_creators=None, dialect=None, sample_creation=None, - bibtex_citation=None, + bibtex_citation=r""" +@misc{long2022multicprmultidomainchinese, + archiveprefix = {arXiv}, + author = {Dingkun Long and Qiong Gao and Kuan Zou and Guangwei Xu and Pengjun Xie and Ruijie Guo and Jian Xu and Guanjun Jiang and Luxi Xing and Ping Yang}, + eprint = {2203.03367}, + primaryclass = {cs.IR}, + title = {Multi-CPR: A Multi Domain Chinese Dataset for Passage Retrieval}, + url = {https://arxiv.org/abs/2203.03367}, + year = {2022}, +} +""", prompt={ "query": "Given a user query from an e-commerce website, retrieve description sentences of relevant products" }, @@ -330,7 +372,17 @@ class MedicalRetrieval(AbsTaskRetrieval): annotations_creators=None, dialect=None, sample_creation=None, - bibtex_citation=None, + bibtex_citation=r""" +@misc{long2022multicprmultidomainchinese, + archiveprefix = {arXiv}, + author = {Dingkun Long and Qiong Gao and Kuan Zou and Guangwei Xu and Pengjun Xie and Ruijie Guo and Jian Xu and Guanjun Jiang and Luxi Xing and Ping Yang}, + eprint = {2203.03367}, + primaryclass = {cs.IR}, + title = {Multi-CPR: A Multi Domain Chinese Dataset for Passage Retrieval}, + url = {https://arxiv.org/abs/2203.03367}, + year = {2022}, +} +""", prompt={ "query": "Given a medical question, retrieve user replies that best answer the question" }, @@ -374,7 +426,17 @@ class VideoRetrieval(AbsTaskRetrieval): annotations_creators=None, dialect=None, sample_creation=None, - bibtex_citation=None, + bibtex_citation=r""" +@misc{long2022multicprmultidomainchinese, + archiveprefix = {arXiv}, + author = {Dingkun Long and Qiong Gao and Kuan Zou and Guangwei Xu and Pengjun Xie and Ruijie Guo and Jian Xu and 
Guanjun Jiang and Luxi Xing and Ping Yang}, + eprint = {2203.03367}, + primaryclass = {cs.IR}, + title = {Multi-CPR: A Multi Domain Chinese Dataset for Passage Retrieval}, + url = {https://arxiv.org/abs/2203.03367}, + year = {2022}, +} +""", prompt={ "query": "Given a video search query, retrieve the titles of relevant videos" }, diff --git a/mteb/tasks/Retrieval/zho/LeCaRDv2Retrieval.py b/mteb/tasks/Retrieval/zho/LeCaRDv2Retrieval.py index 9d3480ff79..e733755fd8 100644 --- a/mteb/tasks/Retrieval/zho/LeCaRDv2Retrieval.py +++ b/mteb/tasks/Retrieval/zho/LeCaRDv2Retrieval.py @@ -27,12 +27,14 @@ class LeCaRDv2(AbsTaskRetrieval): annotations_creators="derived", dialect=None, sample_creation="found", - bibtex_citation="""@misc{li2023lecardv2, - title={LeCaRDv2: A Large-Scale Chinese Legal Case Retrieval Dataset}, - author={Haitao Li and Yunqiu Shao and Yueyue Wu and Qingyao Ai and Yixiao Ma and Yiqun Liu}, - year={2023}, - eprint={2310.17609}, - archivePrefix={arXiv}, - primaryClass={cs.CL} -}""", + bibtex_citation=r""" +@misc{li2023lecardv2, + archiveprefix = {arXiv}, + author = {Haitao Li and Yunqiu Shao and Yueyue Wu and Qingyao Ai and Yixiao Ma and Yiqun Liu}, + eprint = {2310.17609}, + primaryclass = {cs.CL}, + title = {LeCaRDv2: A Large-Scale Chinese Legal Case Retrieval Dataset}, + year = {2023}, +} +""", ) diff --git a/mteb/tasks/STS/deu/GermanSTSBenchmarkSTS.py b/mteb/tasks/STS/deu/GermanSTSBenchmarkSTS.py index 34add4378e..862fb18b98 100644 --- a/mteb/tasks/STS/deu/GermanSTSBenchmarkSTS.py +++ b/mteb/tasks/STS/deu/GermanSTSBenchmarkSTS.py @@ -21,19 +21,21 @@ class GermanSTSBenchmarkSTS(AbsTaskSTS): eval_splits=["validation", "test"], eval_langs=["deu-Latn"], main_score="cosine_spearman", - date=None, - domains=None, + date=("2023-11-09", "2024-01-24"), + domains=[], task_subtypes=None, - license=None, + license="cc-by-sa-3.0", annotations_creators=None, dialect=None, sample_creation=None, - bibtex_citation="""@InProceedings{huggingface:dataset:stsb_multi_mt, 
-title = {Machine translated multilingual STS benchmark dataset.}, -author={Philip May}, -year={2021}, -url={https://github.com/PhilipMay/stsb-multi-mt} -}""", + bibtex_citation=r""" +@inproceedings{huggingface:dataset:stsb_multi_mt, + author = {Philip May}, + title = {Machine translated multilingual STS benchmark dataset.}, + url = {https://github.com/PhilipMay/stsb-multi-mt}, + year = {2021}, +} +""", ) @property diff --git a/mteb/tasks/STS/eng/BiossesSTS.py b/mteb/tasks/STS/eng/BiossesSTS.py index 1fc1d5a1d0..9fc424d043 100644 --- a/mteb/tasks/STS/eng/BiossesSTS.py +++ b/mteb/tasks/STS/eng/BiossesSTS.py @@ -27,21 +27,23 @@ class BiossesSTS(AbsTaskSTS): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation="""@article{10.1093/bioinformatics/btx238, - author = {Soğancıoğlu, Gizem and Öztürk, Hakime and Özgür, Arzucan}, - title = "{BIOSSES: a semantic sentence similarity estimation system for the biomedical domain}", - journal = {Bioinformatics}, - volume = {33}, - number = {14}, - pages = {i49-i58}, - year = {2017}, - month = {07}, - abstract = "{The amount of information available in textual format is rapidly increasing in the biomedical domain. Therefore, natural language processing (NLP) applications are becoming increasingly important to facilitate the retrieval and analysis of these data. Computing the semantic similarity between sentences is an important component in many NLP tasks including text retrieval and summarization. A number of approaches have been proposed for semantic sentence similarity estimation for generic English. 
However, our experiments showed that such approaches do not effectively cover biomedical knowledge and produce poor results for biomedical text.We propose several approaches for sentence-level semantic similarity computation in the biomedical domain, including string similarity measures and measures based on the distributed vector representations of sentences learned in an unsupervised manner from a large biomedical corpus. In addition, ontology-based approaches are presented that utilize general and domain-specific ontologies. Finally, a supervised regression based model is developed that effectively combines the different similarity computation metrics. A benchmark data set consisting of 100 sentence pairs from the biomedical literature is manually annotated by five human experts and used for evaluating the proposed methods.The experiments showed that the supervised semantic sentence similarity computation approach obtained the best performance (0.836 correlation with gold standard human annotations) and improved over the state-of-the-art domain-independent systems up to 42.6\\% in terms of the Pearson correlation metric.A web-based system for biomedical semantic sentence similarity computation, the source code, and the annotated benchmark data set are available at: http://tabilab.cmpe.boun.edu.tr/BIOSSES/.}", - issn = {1367-4803}, - doi = {10.1093/bioinformatics/btx238}, - url = {https://doi.org/10.1093/bioinformatics/btx238}, - eprint = {https://academic.oup.com/bioinformatics/article-pdf/33/14/i49/50315066/bioinformatics\_33\_14\_i49.pdf}, -}""", + bibtex_citation=r""" +@article{10.1093/bioinformatics/btx238, + abstract = {{The amount of information available in textual format is rapidly increasing in the biomedical domain. Therefore, natural language processing (NLP) applications are becoming increasingly important to facilitate the retrieval and analysis of these data. 
Computing the semantic similarity between sentences is an important component in many NLP tasks including text retrieval and summarization. A number of approaches have been proposed for semantic sentence similarity estimation for generic English. However, our experiments showed that such approaches do not effectively cover biomedical knowledge and produce poor results for biomedical text.We propose several approaches for sentence-level semantic similarity computation in the biomedical domain, including string similarity measures and measures based on the distributed vector representations of sentences learned in an unsupervised manner from a large biomedical corpus. In addition, ontology-based approaches are presented that utilize general and domain-specific ontologies. Finally, a supervised regression based model is developed that effectively combines the different similarity computation metrics. A benchmark data set consisting of 100 sentence pairs from the biomedical literature is manually annotated by five human experts and used for evaluating the proposed methods.The experiments showed that the supervised semantic sentence similarity computation approach obtained the best performance (0.836 correlation with gold standard human annotations) and improved over the state-of-the-art domain-independent systems up to 42.6\\% in terms of the Pearson correlation metric.A web-based system for biomedical semantic sentence similarity computation, the source code, and the annotated benchmark data set are available at: http://tabilab.cmpe.boun.edu.tr/BIOSSES/.}}, + author = {Soğancıoğlu, Gizem and Öztürk, Hakime and Özgür, Arzucan}, + doi = {10.1093/bioinformatics/btx238}, + eprint = {https://academic.oup.com/bioinformatics/article-pdf/33/14/i49/50315066/bioinformatics\_33\_14\_i49.pdf}, + issn = {1367-4803}, + journal = {Bioinformatics}, + month = {07}, + number = {14}, + pages = {i49-i58}, + title = {{BIOSSES: a semantic sentence similarity estimation system for the 
biomedical domain}}, + url = {https://doi.org/10.1093/bioinformatics/btx238}, + volume = {33}, + year = {2017}, +} +""", ) @property diff --git a/mteb/tasks/STS/eng/STS12STS.py b/mteb/tasks/STS/eng/STS12STS.py index b222b42c66..a36608a0e9 100644 --- a/mteb/tasks/STS/eng/STS12STS.py +++ b/mteb/tasks/STS/eng/STS12STS.py @@ -27,19 +27,21 @@ class STS12STS(AbsTaskSTS): annotations_creators="human-annotated", dialect=[], sample_creation="created", - bibtex_citation="""@inproceedings{10.5555/2387636.2387697, -author = {Agirre, Eneko and Diab, Mona and Cer, Daniel and Gonzalez-Agirre, Aitor}, -title = {SemEval-2012 task 6: a pilot on semantic textual similarity}, -year = {2012}, -publisher = {Association for Computational Linguistics}, -address = {USA}, -abstract = {Semantic Textual Similarity (STS) measures the degree of semantic equivalence between two texts. This paper presents the results of the STS pilot task in Semeval. The training data contained 2000 sentence pairs from previously existing paraphrase datasets and machine translation evaluation resources. The test data also comprised 2000 sentences pairs for those datasets, plus two surprise datasets with 400 pairs from a different machine translation evaluation corpus and 750 pairs from a lexical resource mapping exercise. The similarity of pairs of sentences was rated on a 0-5 scale (low to high similarity) by human judges using Amazon Mechanical Turk, with high Pearson correlation scores, around 90\%. 35 teams participated in the task, submitting 88 runs. The best results scored a Pearson correlation >80\%, well above a simple lexical baseline that only scored a 31\% correlation. 
This pilot task opens an exciting way ahead, although there are still open issues, specially the evaluation metric.}, -booktitle = {Proceedings of the First Joint Conference on Lexical and Computational Semantics - Volume 1: Proceedings of the Main Conference and the Shared Task, and Volume 2: Proceedings of the Sixth International Workshop on Semantic Evaluation}, -pages = {385–393}, -numpages = {9}, -location = {Montr\'{e}al, Canada}, -series = {SemEval '12} -}""", + bibtex_citation=r""" +@inproceedings{10.5555/2387636.2387697, + abstract = {Semantic Textual Similarity (STS) measures the degree of semantic equivalence between two texts. This paper presents the results of the STS pilot task in Semeval. The training data contained 2000 sentence pairs from previously existing paraphrase datasets and machine translation evaluation resources. The test data also comprised 2000 sentences pairs for those datasets, plus two surprise datasets with 400 pairs from a different machine translation evaluation corpus and 750 pairs from a lexical resource mapping exercise. The similarity of pairs of sentences was rated on a 0-5 scale (low to high similarity) by human judges using Amazon Mechanical Turk, with high Pearson correlation scores, around 90\%. 35 teams participated in the task, submitting 88 runs. The best results scored a Pearson correlation >80\%, well above a simple lexical baseline that only scored a 31\% correlation. 
This pilot task opens an exciting way ahead, although there are still open issues, specially the evaluation metric.}, + address = {USA}, + author = {Agirre, Eneko and Diab, Mona and Cer, Daniel and Gonzalez-Agirre, Aitor}, + booktitle = {Proceedings of the First Joint Conference on Lexical and Computational Semantics - Volume 1: Proceedings of the Main Conference and the Shared Task, and Volume 2: Proceedings of the Sixth International Workshop on Semantic Evaluation}, + location = {Montr\'{e}al, Canada}, + numpages = {9}, + pages = {385–393}, + publisher = {Association for Computational Linguistics}, + series = {SemEval '12}, + title = {SemEval-2012 task 6: a pilot on semantic textual similarity}, + year = {2012}, +} +""", ) @property diff --git a/mteb/tasks/STS/eng/STS13STS.py b/mteb/tasks/STS/eng/STS13STS.py index 415eafbc23..e345618fa8 100644 --- a/mteb/tasks/STS/eng/STS13STS.py +++ b/mteb/tasks/STS/eng/STS13STS.py @@ -27,13 +27,15 @@ class STS13STS(AbsTaskSTS): annotations_creators="human-annotated", dialect=[], sample_creation="created", - bibtex_citation="""@inproceedings{Agirre2013SEM2S, - title={*SEM 2013 shared task: Semantic Textual Similarity}, - author={Eneko Agirre and Daniel Matthew Cer and Mona T. Diab and Aitor Gonzalez-Agirre and Weiwei Guo}, - booktitle={International Workshop on Semantic Evaluation}, - year={2013}, - url={https://api.semanticscholar.org/CorpusID:10241043} -}""", + bibtex_citation=r""" +@inproceedings{Agirre2013SEM2S, + author = {Eneko Agirre and Daniel Matthew Cer and Mona T. 
Diab and Aitor Gonzalez-Agirre and Weiwei Guo}, + booktitle = {International Workshop on Semantic Evaluation}, + title = {*SEM 2013 shared task: Semantic Textual Similarity}, + url = {https://api.semanticscholar.org/CorpusID:10241043}, + year = {2013}, +} +""", ) @property diff --git a/mteb/tasks/STS/eng/STS14STS.py b/mteb/tasks/STS/eng/STS14STS.py index 933cc124da..e2ab4f5d1e 100644 --- a/mteb/tasks/STS/eng/STS14STS.py +++ b/mteb/tasks/STS/eng/STS14STS.py @@ -27,24 +27,26 @@ class STS14STS(AbsTaskSTS): annotations_creators="derived", dialect=[], sample_creation="created", - bibtex_citation="""@inproceedings{bandhakavi-etal-2014-generating, - title = "Generating a Word-Emotion Lexicon from {\#}Emotional Tweets", - author = "Bandhakavi, Anil and - Wiratunga, Nirmalie and - P, Deepak and - Massie, Stewart", - editor = "Bos, Johan and - Frank, Anette and - Navigli, Roberto", - booktitle = "Proceedings of the Third Joint Conference on Lexical and Computational Semantics (*{SEM} 2014)", - month = aug, - year = "2014", - address = "Dublin, Ireland", - publisher = "Association for Computational Linguistics and Dublin City University", - url = "https://aclanthology.org/S14-1002", - doi = "10.3115/v1/S14-1002", - pages = "12--21", -}""", + bibtex_citation=r""" +@inproceedings{bandhakavi-etal-2014-generating, + address = {Dublin, Ireland}, + author = {Bandhakavi, Anil and +Wiratunga, Nirmalie and +P, Deepak and +Massie, Stewart}, + booktitle = {Proceedings of the Third Joint Conference on Lexical and Computational Semantics (*{SEM} 2014)}, + doi = {10.3115/v1/S14-1002}, + editor = {Bos, Johan and +Frank, Anette and +Navigli, Roberto}, + month = aug, + pages = {12--21}, + publisher = {Association for Computational Linguistics and Dublin City University}, + title = {Generating a Word-Emotion Lexicon from {\#}Emotional Tweets}, + url = {https://aclanthology.org/S14-1002}, + year = {2014}, +} +""", ) @property diff --git a/mteb/tasks/STS/eng/STS15STS.py 
b/mteb/tasks/STS/eng/STS15STS.py index 99e81aa90f..4ffad2e282 100644 --- a/mteb/tasks/STS/eng/STS15STS.py +++ b/mteb/tasks/STS/eng/STS15STS.py @@ -27,22 +27,24 @@ class STS15STS(AbsTaskSTS): annotations_creators="human-annotated", dialect=[], sample_creation="created", - bibtex_citation="""@inproceedings{bicici-2015-rtm, - title = "{RTM}-{DCU}: Predicting Semantic Similarity with Referential Translation Machines", - author = "Bi{\c{c}}ici, Ergun", - editor = "Nakov, Preslav and - Zesch, Torsten and - Cer, Daniel and - Jurgens, David", - booktitle = "Proceedings of the 9th International Workshop on Semantic Evaluation ({S}em{E}val 2015)", - month = jun, - year = "2015", - address = "Denver, Colorado", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/S15-2010", - doi = "10.18653/v1/S15-2010", - pages = "56--63", -}""", + bibtex_citation=r""" +@inproceedings{bicici-2015-rtm, + address = {Denver, Colorado}, + author = {Bi{\c{c}}ici, Ergun}, + booktitle = {Proceedings of the 9th International Workshop on Semantic Evaluation ({S}em{E}val 2015)}, + doi = {10.18653/v1/S15-2010}, + editor = {Nakov, Preslav and +Zesch, Torsten and +Cer, Daniel and +Jurgens, David}, + month = jun, + pages = {56--63}, + publisher = {Association for Computational Linguistics}, + title = {{RTM}-{DCU}: Predicting Semantic Similarity with Referential Translation Machines}, + url = {https://aclanthology.org/S15-2010}, + year = {2015}, +} +""", ) @property diff --git a/mteb/tasks/STS/eng/STS16STS.py b/mteb/tasks/STS/eng/STS16STS.py index 94c978d4fc..4e489ffc2d 100644 --- a/mteb/tasks/STS/eng/STS16STS.py +++ b/mteb/tasks/STS/eng/STS16STS.py @@ -27,28 +27,30 @@ class STS16STS(AbsTaskSTS): annotations_creators="human-annotated", dialect=[], sample_creation="created", - bibtex_citation="""@inproceedings{nakov-etal-2016-semeval, - title = "{S}em{E}val-2016 Task 4: Sentiment Analysis in {T}witter", - author = "Nakov, Preslav and - Ritter, Alan and - Rosenthal, 
Sara and - Sebastiani, Fabrizio and - Stoyanov, Veselin", - editor = "Bethard, Steven and - Carpuat, Marine and - Cer, Daniel and - Jurgens, David and - Nakov, Preslav and - Zesch, Torsten", - booktitle = "Proceedings of the 10th International Workshop on Semantic Evaluation ({S}em{E}val-2016)", - month = jun, - year = "2016", - address = "San Diego, California", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/S16-1001", - doi = "10.18653/v1/S16-1001", - pages = "1--18", -}""", + bibtex_citation=r""" +@inproceedings{nakov-etal-2016-semeval, + address = {San Diego, California}, + author = {Nakov, Preslav and +Ritter, Alan and +Rosenthal, Sara and +Sebastiani, Fabrizio and +Stoyanov, Veselin}, + booktitle = {Proceedings of the 10th International Workshop on Semantic Evaluation ({S}em{E}val-2016)}, + doi = {10.18653/v1/S16-1001}, + editor = {Bethard, Steven and +Carpuat, Marine and +Cer, Daniel and +Jurgens, David and +Nakov, Preslav and +Zesch, Torsten}, + month = jun, + pages = {1--18}, + publisher = {Association for Computational Linguistics}, + title = {{S}em{E}val-2016 Task 4: Sentiment Analysis in {T}witter}, + url = {https://aclanthology.org/S16-1001}, + year = {2016}, +} +""", ) @property diff --git a/mteb/tasks/STS/eng/STSBenchmarkSTS.py b/mteb/tasks/STS/eng/STSBenchmarkSTS.py index e600711d34..c1363128d4 100644 --- a/mteb/tasks/STS/eng/STSBenchmarkSTS.py +++ b/mteb/tasks/STS/eng/STSBenchmarkSTS.py @@ -27,12 +27,14 @@ class STSBenchmarkSTS(AbsTaskSTS): annotations_creators="human-annotated", dialect=[], sample_creation="machine-translated and verified", - bibtex_citation="""@InProceedings{huggingface:dataset:stsb_multi_mt, -title = {Machine translated multilingual STS benchmark dataset.}, -author={Philip May}, -year={2021}, -url={https://github.com/PhilipMay/stsb-multi-mt} -}""", + bibtex_citation=r""" +@inproceedings{huggingface:dataset:stsb_multi_mt, + author = {Philip May}, + title = {Machine translated 
multilingual STS benchmark dataset.}, + url = {https://github.com/PhilipMay/stsb-multi-mt}, + year = {2021}, +} +""", ) @property diff --git a/mteb/tasks/STS/eng/SickrSTS.py b/mteb/tasks/STS/eng/SickrSTS.py index 1c93fff578..b0b7a580d2 100644 --- a/mteb/tasks/STS/eng/SickrSTS.py +++ b/mteb/tasks/STS/eng/SickrSTS.py @@ -27,32 +27,34 @@ class SickrSTS(AbsTaskSTS): annotations_creators="human-annotated", dialect=None, sample_creation=None, - bibtex_citation="""@inproceedings{marelli-etal-2014-sick, - title = "A {SICK} cure for the evaluation of compositional distributional semantic models", - author = "Marelli, Marco and - Menini, Stefano and - Baroni, Marco and - Bentivogli, Luisa and - Bernardi, Raffaella and - Zamparelli, Roberto", - editor = "Calzolari, Nicoletta and - Choukri, Khalid and - Declerck, Thierry and - Loftsson, Hrafn and - Maegaard, Bente and - Mariani, Joseph and - Moreno, Asuncion and - Odijk, Jan and - Piperidis, Stelios", - booktitle = "Proceedings of the Ninth International Conference on Language Resources and Evaluation ({LREC}'14)", - month = may, - year = "2014", - address = "Reykjavik, Iceland", - publisher = "European Language Resources Association (ELRA)", - url = "http://www.lrec-conf.org/proceedings/lrec2014/pdf/363_Paper.pdf", - pages = "216--223", - abstract = "Shared and internationally recognized benchmarks are fundamental for the development of any computational system. We aim to help the research community working on compositional distributional semantic models (CDSMs) by providing SICK (Sentences Involving Compositional Knowldedge), a large size English benchmark tailored for them. 
SICK consists of about 10,000 English sentence pairs that include many examples of the lexical, syntactic and semantic phenomena that CDSMs are expected to account for, but do not require dealing with other aspects of existing sentential data sets (idiomatic multiword expressions, named entities, telegraphic language) that are not within the scope of CDSMs. By means of crowdsourcing techniques, each pair was annotated for two crucial semantic tasks: relatedness in meaning (with a 5-point rating scale as gold score) and entailment relation between the two elements (with three possible gold labels: entailment, contradiction, and neutral). The SICK data set was used in SemEval-2014 Task 1, and it freely available for research purposes.", -}""", + bibtex_citation=r""" +@inproceedings{marelli-etal-2014-sick, + abstract = {Shared and internationally recognized benchmarks are fundamental for the development of any computational system. We aim to help the research community working on compositional distributional semantic models (CDSMs) by providing SICK (Sentences Involving Compositional Knowldedge), a large size English benchmark tailored for them. SICK consists of about 10,000 English sentence pairs that include many examples of the lexical, syntactic and semantic phenomena that CDSMs are expected to account for, but do not require dealing with other aspects of existing sentential data sets (idiomatic multiword expressions, named entities, telegraphic language) that are not within the scope of CDSMs. By means of crowdsourcing techniques, each pair was annotated for two crucial semantic tasks: relatedness in meaning (with a 5-point rating scale as gold score) and entailment relation between the two elements (with three possible gold labels: entailment, contradiction, and neutral). 
The SICK data set was used in SemEval-2014 Task 1, and it freely available for research purposes.}, + address = {Reykjavik, Iceland}, + author = {Marelli, Marco and +Menini, Stefano and +Baroni, Marco and +Bentivogli, Luisa and +Bernardi, Raffaella and +Zamparelli, Roberto}, + booktitle = {Proceedings of the Ninth International Conference on Language Resources and Evaluation ({LREC}'14)}, + editor = {Calzolari, Nicoletta and +Choukri, Khalid and +Declerck, Thierry and +Loftsson, Hrafn and +Maegaard, Bente and +Mariani, Joseph and +Moreno, Asuncion and +Odijk, Jan and +Piperidis, Stelios}, + month = may, + pages = {216--223}, + publisher = {European Language Resources Association (ELRA)}, + title = {A {SICK} cure for the evaluation of compositional distributional semantic models}, + url = {http://www.lrec-conf.org/proceedings/lrec2014/pdf/363_Paper.pdf}, + year = {2014}, +} +""", ) @property diff --git a/mteb/tasks/STS/fao/FaroeseSTS.py b/mteb/tasks/STS/fao/FaroeseSTS.py index 156485321a..2fa91ddf11 100644 --- a/mteb/tasks/STS/fao/FaroeseSTS.py +++ b/mteb/tasks/STS/fao/FaroeseSTS.py @@ -27,20 +27,20 @@ class FaroeseSTS(AbsTaskSTS): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @inproceedings{snaebjarnarson-etal-2023-transfer, - title = "{T}ransfer to a Low-Resource Language via Close Relatives: The Case Study on Faroese", - author = "Snæbjarnarson, Vésteinn and - Simonsen, Annika and - Glavaš, Goran and - Vulić, Ivan", - booktitle = "Proceedings of the 24th Nordic Conference on Computational Linguistics (NoDaLiDa)", - month = "may 22--24", - year = "2023", - address = "Tórshavn, Faroe Islands", - publisher = {Link{\"o}ping University Electronic Press, Sweden}, - } - """, + bibtex_citation=r""" +@inproceedings{snaebjarnarson-etal-2023-transfer, + address = {Tórshavn, Faroe Islands}, + author = {Snæbjarnarson, Vésteinn and +Simonsen, Annika and +Glavaš, Goran and +Vulić, Ivan}, + booktitle = {Proceedings of the 
24th Nordic Conference on Computational Linguistics (NoDaLiDa)}, + month = {may 22--24}, + publisher = {Link{\"o}ping University Electronic Press, Sweden}, + title = {{T}ransfer to a Low-Resource Language via Close Relatives: The Case Study on Faroese}, + year = {2023}, +} +""", ) @property diff --git a/mteb/tasks/STS/fin/FinParaSTS.py b/mteb/tasks/STS/fin/FinParaSTS.py index 8d647112e8..63331e7c8e 100644 --- a/mteb/tasks/STS/fin/FinParaSTS.py +++ b/mteb/tasks/STS/fin/FinParaSTS.py @@ -31,30 +31,30 @@ class FinParaSTS(AbsTaskSTS): annotations_creators="expert-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @inproceedings{kanerva-etal-2021-finnish, - title = "{F}innish Paraphrase Corpus", - author = {Kanerva, Jenna and - Ginter, Filip and - Chang, Li-Hsin and - Rastas, Iiro and - Skantsi, Valtteri and - Kilpel{\"a}inen, Jemina and - Kupari, Hanna-Mari and - Saarni, Jenna and - Sev{\'o}n, Maija and - Tarkka, Otto}, - editor = "Dobnik, Simon and - {\\O}vrelid, Lilja", - booktitle = "Proceedings of the 23rd Nordic Conference on Computational Linguistics (NoDaLiDa)", - month = may # " 31--2 " # jun, - year = "2021", - address = "Reykjavik, Iceland (Online)", - publisher = {Link{\"o}ping University Electronic Press, Sweden}, - url = "https://aclanthology.org/2021.nodalida-main.29", - pages = "288--298", - } - """, + bibtex_citation=r""" +@inproceedings{kanerva-etal-2021-finnish, + address = {Reykjavik, Iceland (Online)}, + author = {Kanerva, Jenna and +Ginter, Filip and +Chang, Li-Hsin and +Rastas, Iiro and +Skantsi, Valtteri and +Kilpel{\"a}inen, Jemina and +Kupari, Hanna-Mari and +Saarni, Jenna and +Sev{\'o}n, Maija and +Tarkka, Otto}, + booktitle = {Proceedings of the 23rd Nordic Conference on Computational Linguistics (NoDaLiDa)}, + editor = {Dobnik, Simon and +{\O}vrelid, Lilja}, + month = may # { 31--2 } # jun, + pages = {288--298}, + publisher = {Link{\"o}ping University Electronic Press, Sweden}, + title = {{F}innish Paraphrase Corpus},
+ url = {https://aclanthology.org/2021.nodalida-main.29}, + year = {2021}, +} +""", ) @property diff --git a/mteb/tasks/STS/fra/SickFrSTS.py b/mteb/tasks/STS/fra/SickFrSTS.py index 241aa60163..c34a933121 100644 --- a/mteb/tasks/STS/fra/SickFrSTS.py +++ b/mteb/tasks/STS/fra/SickFrSTS.py @@ -21,7 +21,7 @@ class SickFrSTS(AbsTaskSTS): eval_langs=["fra-Latn"], main_score="cosine_spearman", date=None, - domains=None, + domains=[], task_subtypes=None, license=None, annotations_creators=None, diff --git a/mteb/tasks/STS/jpn/JSICK.py b/mteb/tasks/STS/jpn/JSICK.py index 554a3abf1d..cb0ea0920e 100644 --- a/mteb/tasks/STS/jpn/JSICK.py +++ b/mteb/tasks/STS/jpn/JSICK.py @@ -28,17 +28,17 @@ class JSICK(AbsTaskSTS): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation=""" - @article{yanaka2022compositional, - title={Compositional Evaluation on Japanese Textual Entailment and Similarity}, - author={Yanaka, Hitomi and Mineshima, Koji}, - journal={Transactions of the Association for Computational Linguistics}, - volume={10}, - pages={1266--1284}, - year={2022}, - publisher={MIT Press One Broadway, 12th Floor, Cambridge, Massachusetts 02142, USA~…} - } - """, + bibtex_citation=r""" +@article{yanaka2022compositional, + author = {Yanaka, Hitomi and Mineshima, Koji}, + journal = {Transactions of the Association for Computational Linguistics}, + pages = {1266--1284}, + publisher = {MIT Press One Broadway, 12th Floor, Cambridge, Massachusetts 02142, USA~…}, + title = {Compositional Evaluation on Japanese Textual Entailment and Similarity}, + volume = {10}, + year = {2022}, +} +""", ) @property diff --git a/mteb/tasks/STS/jpn/JSTS.py b/mteb/tasks/STS/jpn/JSTS.py index 4993359190..1b426bded6 100644 --- a/mteb/tasks/STS/jpn/JSTS.py +++ b/mteb/tasks/STS/jpn/JSTS.py @@ -30,33 +30,35 @@ class JSTS(AbsTaskSTS): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{kurihara-etal-2022-jglue, - 
title = "{JGLUE}: {J}apanese General Language Understanding Evaluation", - author = "Kurihara, Kentaro and - Kawahara, Daisuke and - Shibata, Tomohide", - editor = "Calzolari, Nicoletta and - B{\'e}chet, Fr{\'e}d{\'e}ric and - Blache, Philippe and - Choukri, Khalid and - Cieri, Christopher and - Declerck, Thierry and - Goggi, Sara and - Isahara, Hitoshi and - Maegaard, Bente and - Mariani, Joseph and - Mazo, H{\'e}l{\`e}ne and - Odijk, Jan and - Piperidis, Stelios", - booktitle = "Proceedings of the Thirteenth Language Resources and Evaluation Conference", - month = jun, - year = "2022", - address = "Marseille, France", - publisher = "European Language Resources Association", - url = "https://aclanthology.org/2022.lrec-1.317", - pages = "2957--2966", - abstract = "To develop high-performance natural language understanding (NLU) models, it is necessary to have a benchmark to evaluate and analyze NLU ability from various perspectives. While the English NLU benchmark, GLUE, has been the forerunner, benchmarks are now being released for languages other than English, such as CLUE for Chinese and FLUE for French; but there is no such benchmark for Japanese. We build a Japanese NLU benchmark, JGLUE, from scratch without translation to measure the general NLU ability in Japanese. We hope that JGLUE will facilitate NLU research in Japanese.", -}""", + bibtex_citation=r""" +@inproceedings{kurihara-etal-2022-jglue, + abstract = {To develop high-performance natural language understanding (NLU) models, it is necessary to have a benchmark to evaluate and analyze NLU ability from various perspectives. While the English NLU benchmark, GLUE, has been the forerunner, benchmarks are now being released for languages other than English, such as CLUE for Chinese and FLUE for French; but there is no such benchmark for Japanese. We build a Japanese NLU benchmark, JGLUE, from scratch without translation to measure the general NLU ability in Japanese. 
We hope that JGLUE will facilitate NLU research in Japanese.}, + address = {Marseille, France}, + author = {Kurihara, Kentaro and +Kawahara, Daisuke and +Shibata, Tomohide}, + booktitle = {Proceedings of the Thirteenth Language Resources and Evaluation Conference}, + editor = {Calzolari, Nicoletta and +B{\'e}chet, Fr{\'e}d{\'e}ric and +Blache, Philippe and +Choukri, Khalid and +Cieri, Christopher and +Declerck, Thierry and +Goggi, Sara and +Isahara, Hitoshi and +Maegaard, Bente and +Mariani, Joseph and +Mazo, H{\'e}l{\`e}ne and +Odijk, Jan and +Piperidis, Stelios}, + month = jun, + pages = {2957--2966}, + publisher = {European Language Resources Association}, + title = {{JGLUE}: {J}apanese General Language Understanding Evaluation}, + url = {https://aclanthology.org/2022.lrec-1.317}, + year = {2022}, +} +""", ) @property diff --git a/mteb/tasks/STS/kor/KlueSTS.py b/mteb/tasks/STS/kor/KlueSTS.py index 0ff8a724bb..2934133fed 100644 --- a/mteb/tasks/STS/kor/KlueSTS.py +++ b/mteb/tasks/STS/kor/KlueSTS.py @@ -28,14 +28,16 @@ class KlueSTS(AbsTaskSTS): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@misc{park2021klue, - title={KLUE: Korean Language Understanding Evaluation}, - author={Sungjoon Park and Jihyung Moon and Sungdong Kim and Won Ik Cho and Jiyoon Han and Jangwon Park and Chisung Song and Junseong Kim and Yongsook Song and Taehwan Oh and Joohong Lee and Juhyun Oh and Sungwon Lyu and Younghoon Jeong and Inkwon Lee and Sangwoo Seo and Dongjun Lee and Hyunwoo Kim and Myeonghwa Lee and Seongbo Jang and Seungwon Do and Sunkyoung Kim and Kyungtae Lim and Jongwon Lee and Kyumin Park and Jamin Shin and Seonghyun Kim and Lucy Park and Alice Oh and Jungwoo Ha and Kyunghyun Cho}, - year={2021}, - eprint={2105.09680}, - archivePrefix={arXiv}, - primaryClass={cs.CL} -}""", + bibtex_citation=r""" +@misc{park2021klue, + archiveprefix = {arXiv}, + author = {Sungjoon Park and Jihyung Moon and Sungdong Kim and Won Ik Cho and 
Jiyoon Han and Jangwon Park and Chisung Song and Junseong Kim and Yongsook Song and Taehwan Oh and Joohong Lee and Juhyun Oh and Sungwon Lyu and Younghoon Jeong and Inkwon Lee and Sangwoo Seo and Dongjun Lee and Hyunwoo Kim and Myeonghwa Lee and Seongbo Jang and Seungwon Do and Sunkyoung Kim and Kyungtae Lim and Jongwon Lee and Kyumin Park and Jamin Shin and Seonghyun Kim and Lucy Park and Alice Oh and Jungwoo Ha and Kyunghyun Cho}, + eprint = {2105.09680}, + primaryclass = {cs.CL}, + title = {KLUE: Korean Language Understanding Evaluation}, + year = {2021}, +} +""", ) @property diff --git a/mteb/tasks/STS/kor/KorSTS.py b/mteb/tasks/STS/kor/KorSTS.py index 6ab1437bb1..bc2ce9258e 100644 --- a/mteb/tasks/STS/kor/KorSTS.py +++ b/mteb/tasks/STS/kor/KorSTS.py @@ -27,12 +27,14 @@ class KorSTS(AbsTaskSTS): annotations_creators=None, dialect=[], sample_creation="machine-translated and localized", - bibtex_citation="""@article{ham2020kornli, - title={KorNLI and KorSTS: New Benchmark Datasets for Korean Natural Language Understanding}, - author={Ham, Jiyeon and Choe, Yo Joong and Park, Kyubyong and Choi, Ilji and Soh, Hyungjoon}, - journal={arXiv preprint arXiv:2004.03289}, - year={2020} -}""", + bibtex_citation=r""" +@article{ham2020kornli, + author = {Ham, Jiyeon and Choe, Yo Joong and Park, Kyubyong and Choi, Ilji and Soh, Hyungjoon}, + journal = {arXiv preprint arXiv:2004.03289}, + title = {KorNLI and KorSTS: New Benchmark Datasets for Korean Natural Language Understanding}, + year = {2020}, +} +""", ) @property diff --git a/mteb/tasks/STS/multilingual/IndicCrosslingualSTS.py b/mteb/tasks/STS/multilingual/IndicCrosslingualSTS.py index 2d5b653ec2..e22223979e 100644 --- a/mteb/tasks/STS/multilingual/IndicCrosslingualSTS.py +++ b/mteb/tasks/STS/multilingual/IndicCrosslingualSTS.py @@ -60,19 +60,21 @@ class IndicCrosslingualSTS(AbsTaskSTS, MultilingualTask): annotations_creators="expert-annotated", dialect=[], sample_creation="created", - 
bibtex_citation="""@article{10.1162/tacl_a_00452, - author = {Ramesh, Gowtham and Doddapaneni, Sumanth and Bheemaraj, Aravinth and Jobanputra, Mayank and AK, Raghavan and Sharma, Ajitesh and Sahoo, Sujit and Diddee, Harshita and J, Mahalakshmi and Kakwani, Divyanshu and Kumar, Navneet and Pradeep, Aswin and Nagaraj, Srihari and Deepak, Kumar and Raghavan, Vivek and Kunchukuttan, Anoop and Kumar, Pratyush and Khapra, Mitesh Shantadevi}, - title = "{Samanantar: The Largest Publicly Available Parallel Corpora Collection for 11 Indic Languages}", - journal = {Transactions of the Association for Computational Linguistics}, - volume = {10}, - pages = {145-162}, - year = {2022}, - month = {02}, - issn = {2307-387X}, - doi = {10.1162/tacl_a_00452}, - url = {https://doi.org/10.1162/tacl\\_a\\_00452}, - eprint = {https://direct.mit.edu/tacl/article-pdf/doi/10.1162/tacl\\_a\\_00452/1987010/tacl\\_a\\_00452.pdf}, -}""", + bibtex_citation=r""" +@article{10.1162/tacl_a_00452, + author = {Ramesh, Gowtham and Doddapaneni, Sumanth and Bheemaraj, Aravinth and Jobanputra, Mayank and AK, Raghavan and Sharma, Ajitesh and Sahoo, Sujit and Diddee, Harshita and J, Mahalakshmi and Kakwani, Divyanshu and Kumar, Navneet and Pradeep, Aswin and Nagaraj, Srihari and Deepak, Kumar and Raghavan, Vivek and Kunchukuttan, Anoop and Kumar, Pratyush and Khapra, Mitesh Shantadevi}, + doi = {10.1162/tacl_a_00452}, + eprint = {https://direct.mit.edu/tacl/article-pdf/doi/10.1162/tacl\_a\_00452/1987010/tacl\_a\_00452.pdf}, + issn = {2307-387X}, + journal = {Transactions of the Association for Computational Linguistics}, + month = {02}, + pages = {145-162}, + title = {{Samanantar: The Largest Publicly Available Parallel Corpora Collection for 11 Indic Languages}}, + url = {https://doi.org/10.1162/tacl\_a\_00452}, + volume = {10}, + year = {2022}, +} +""", ) @property diff --git a/mteb/tasks/STS/multilingual/STS17CrosslingualSTS.py b/mteb/tasks/STS/multilingual/STS17CrosslingualSTS.py index
478f2fbd68..47789d9648 100644 --- a/mteb/tasks/STS/multilingual/STS17CrosslingualSTS.py +++ b/mteb/tasks/STS/multilingual/STS17CrosslingualSTS.py @@ -42,29 +42,31 @@ class STS17Crosslingual(AbsTaskSTS, MultilingualTask): annotations_creators="human-annotated", dialect=[], sample_creation="created", - bibtex_citation="""@inproceedings{cer-etal-2017-semeval, - title = "{S}em{E}val-2017 Task 1: Semantic Textual Similarity Multilingual and Crosslingual Focused Evaluation", - author = "Cer, Daniel and - Diab, Mona and - Agirre, Eneko and - Lopez-Gazpio, I{\\~n}igo and - Specia, Lucia", - editor = "Bethard, Steven and - Carpuat, Marine and - Apidianaki, Marianna and - Mohammad, Saif M. and - Cer, Daniel and - Jurgens, David", - booktitle = "Proceedings of the 11th International Workshop on Semantic Evaluation ({S}em{E}val-2017)", - month = aug, - year = "2017", - address = "Vancouver, Canada", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/S17-2001", - doi = "10.18653/v1/S17-2001", - pages = "1--14", - abstract = "Semantic Textual Similarity (STS) measures the meaning similarity of sentences. Applications include machine translation (MT), summarization, generation, question answering (QA), short answer grading, semantic search, dialog and conversational systems. The STS shared task is a venue for assessing the current state-of-the-art. The 2017 task focuses on multilingual and cross-lingual pairs with one sub-track exploring MT quality estimation (MTQE) data. The task obtained strong participation from 31 teams, with 17 participating in \textit{all language tracks}. We summarize performance and review a selection of well performing methods. Analysis highlights common errors, providing insight into the limitations of existing models. 
To support ongoing work on semantic representations, the \textit{STS Benchmark} is introduced as a new shared training and evaluation set carefully selected from the corpus of English STS shared task data (2012-2017).", -}""", + bibtex_citation=r""" +@inproceedings{cer-etal-2017-semeval, + abstract = {Semantic Textual Similarity (STS) measures the meaning similarity of sentences. Applications include machine translation (MT), summarization, generation, question answering (QA), short answer grading, semantic search, dialog and conversational systems. The STS shared task is a venue for assessing the current state-of-the-art. The 2017 task focuses on multilingual and cross-lingual pairs with one sub-track exploring MT quality estimation (MTQE) data. The task obtained strong participation from 31 teams, with 17 participating in \textit{all language tracks}. We summarize performance and review a selection of well performing methods. Analysis highlights common errors, providing insight into the limitations of existing models. To support ongoing work on semantic representations, the \textit{STS Benchmark} is introduced as a new shared training and evaluation set carefully selected from the corpus of English STS shared task data (2012-2017).}, + address = {Vancouver, Canada}, + author = {Cer, Daniel and +Diab, Mona and +Agirre, Eneko and +Lopez-Gazpio, I{\~n}igo and +Specia, Lucia}, + booktitle = {Proceedings of the 11th International Workshop on Semantic Evaluation ({S}em{E}val-2017)}, + doi = {10.18653/v1/S17-2001}, + editor = {Bethard, Steven and +Carpuat, Marine and +Apidianaki, Marianna and +Mohammad, Saif M.
and +Cer, Daniel and +Jurgens, David}, + month = aug, + pages = {1--14}, + publisher = {Association for Computational Linguistics}, + title = {{S}em{E}val-2017 Task 1: Semantic Textual Similarity Multilingual and Crosslingual Focused Evaluation}, + url = {https://aclanthology.org/S17-2001}, + year = {2017}, +} +""", ) @property diff --git a/mteb/tasks/STS/multilingual/STS22CrosslingualSTS.py b/mteb/tasks/STS/multilingual/STS22CrosslingualSTS.py index cc231b63b3..09eb936a48 100644 --- a/mteb/tasks/STS/multilingual/STS22CrosslingualSTS.py +++ b/mteb/tasks/STS/multilingual/STS22CrosslingualSTS.py @@ -49,34 +49,36 @@ class STS22CrosslingualSTSv2(AbsTaskSTS, MultilingualTask): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{chen-etal-2022-semeval, - title = "{S}em{E}val-2022 Task 8: Multilingual news article similarity", - author = {Chen, Xi and - Zeynali, Ali and - Camargo, Chico and - Fl{\"o}ck, Fabian and - Gaffney, Devin and - Grabowicz, Przemyslaw and - Hale, Scott and - Jurgens, David and - Samory, Mattia}, - editor = "Emerson, Guy and - Schluter, Natalie and - Stanovsky, Gabriel and - Kumar, Ritesh and - Palmer, Alexis and - Schneider, Nathan and - Singh, Siddharth and - Ratan, Shyam", - booktitle = "Proceedings of the 16th International Workshop on Semantic Evaluation (SemEval-2022)", - month = jul, - year = "2022", - address = "Seattle, United States", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/2022.semeval-1.155", - doi = "10.18653/v1/2022.semeval-1.155", - pages = "1094--1106", -}""", + bibtex_citation=r""" +@inproceedings{chen-etal-2022-semeval, + address = {Seattle, United States}, + author = {Chen, Xi and +Zeynali, Ali and +Camargo, Chico and +Fl{\"o}ck, Fabian and +Gaffney, Devin and +Grabowicz, Przemyslaw and +Hale, Scott and +Jurgens, David and +Samory, Mattia}, + booktitle = {Proceedings of the 16th International Workshop on Semantic 
Evaluation (SemEval-2022)}, + doi = {10.18653/v1/2022.semeval-1.155}, + editor = {Emerson, Guy and +Schluter, Natalie and +Stanovsky, Gabriel and +Kumar, Ritesh and +Palmer, Alexis and +Schneider, Nathan and +Singh, Siddharth and +Ratan, Shyam}, + month = jul, + pages = {1094--1106}, + publisher = {Association for Computational Linguistics}, + title = {{S}em{E}val-2022 Task 8: Multilingual news article similarity}, + url = {https://aclanthology.org/2022.semeval-1.155}, + year = {2022}, +} +""", adapted_from=["STS22"], ) @@ -112,34 +114,36 @@ class STS22CrosslingualSTS(AbsTaskSTS, MultilingualTask): annotations_creators="human-annotated", dialect=[], sample_creation="found", - bibtex_citation="""@inproceedings{chen-etal-2022-semeval, - title = "{S}em{E}val-2022 Task 8: Multilingual news article similarity", - author = {Chen, Xi and - Zeynali, Ali and - Camargo, Chico and - Fl{\"o}ck, Fabian and - Gaffney, Devin and - Grabowicz, Przemyslaw and - Hale, Scott and - Jurgens, David and - Samory, Mattia}, - editor = "Emerson, Guy and - Schluter, Natalie and - Stanovsky, Gabriel and - Kumar, Ritesh and - Palmer, Alexis and - Schneider, Nathan and - Singh, Siddharth and - Ratan, Shyam", - booktitle = "Proceedings of the 16th International Workshop on Semantic Evaluation (SemEval-2022)", - month = jul, - year = "2022", - address = "Seattle, United States", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/2022.semeval-1.155", - doi = "10.18653/v1/2022.semeval-1.155", - pages = "1094--1106", -}""", + bibtex_citation=r""" +@inproceedings{chen-etal-2022-semeval, + address = {Seattle, United States}, + author = {Chen, Xi and +Zeynali, Ali and +Camargo, Chico and +Fl{\"o}ck, Fabian and +Gaffney, Devin and +Grabowicz, Przemyslaw and +Hale, Scott and +Jurgens, David and +Samory, Mattia}, + booktitle = {Proceedings of the 16th International Workshop on Semantic Evaluation (SemEval-2022)}, + doi = {10.18653/v1/2022.semeval-1.155}, + editor = 
{Emerson, Guy and +Schluter, Natalie and +Stanovsky, Gabriel and +Kumar, Ritesh and +Palmer, Alexis and +Schneider, Nathan and +Singh, Siddharth and +Ratan, Shyam}, + month = jul, + pages = {1094--1106}, + publisher = {Association for Computational Linguistics}, + title = {{S}em{E}val-2022 Task 8: Multilingual news article similarity}, + url = {https://aclanthology.org/2022.semeval-1.155}, + year = {2022}, +} +""", ) @property diff --git a/mteb/tasks/STS/multilingual/STSBenchmarkMultilingualSTS.py b/mteb/tasks/STS/multilingual/STSBenchmarkMultilingualSTS.py index eaf5ff1afb..9e04db5b02 100644 --- a/mteb/tasks/STS/multilingual/STSBenchmarkMultilingualSTS.py +++ b/mteb/tasks/STS/multilingual/STSBenchmarkMultilingualSTS.py @@ -46,12 +46,14 @@ class STSBenchmarkMultilingualSTS(AbsTaskSTS, MultilingualTask): annotations_creators="human-annotated", dialect=[], sample_creation="machine-translated", - bibtex_citation="""@InProceedings{huggingface:dataset:stsb_multi_mt, - title = {Machine translated multilingual STS benchmark dataset.}, - author={Philip May}, - year={2021}, - url={https://github.com/PhilipMay/stsb-multi-mt} - }""", + bibtex_citation=r""" +@inproceedings{huggingface:dataset:stsb_multi_mt, + author = {Philip May}, + title = {Machine translated multilingual STS benchmark dataset.}, + url = {https://github.com/PhilipMay/stsb-multi-mt}, + year = {2021}, +} +""", ) @property diff --git a/mteb/tasks/STS/multilingual/SemRel24STS.py b/mteb/tasks/STS/multilingual/SemRel24STS.py index f6c1e7c402..c94ae87c02 100644 --- a/mteb/tasks/STS/multilingual/SemRel24STS.py +++ b/mteb/tasks/STS/multilingual/SemRel24STS.py @@ -49,20 +49,21 @@ class SemRel24STS(AbsTaskSTS, MultilingualTask): annotations_creators="human-annotated", dialect=[], sample_creation="created", - bibtex_citation="""@misc{ousidhoum2024semrel2024, - title={SemRel2024: A Collection of Semantic Textual Relatedness Datasets for 14 Languages}, - author={Nedjma Ousidhoum and Shamsuddeen Hassan Muhammad and Mohamed 
Abdalla and Idris Abdulmumin and Ibrahim Said Ahmad and - Sanchit Ahuja and Alham Fikri Aji and Vladimir Araujo and Abinew Ali Ayele and Pavan Baswani and Meriem Beloucif and - Chris Biemann and Sofia Bourhim and Christine De Kock and Genet Shanko Dekebo and - Oumaima Hourrane and Gopichand Kanumolu and Lokesh Madasu and Samuel Rutunda and Manish Shrivastava and - Thamar Solorio and Nirmal Surange and Hailegnaw Getaneh Tilaye and Krishnapriya Vishnubhotla and Genta Winata and - Seid Muhie Yimam and Saif M. Mohammad}, - year={2024}, - eprint={2402.08638}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - } - """, + bibtex_citation=r""" +@misc{ousidhoum2024semrel2024, + archiveprefix = {arXiv}, + author = {Nedjma Ousidhoum and Shamsuddeen Hassan Muhammad and Mohamed Abdalla and Idris Abdulmumin and Ibrahim Said Ahmad and +Sanchit Ahuja and Alham Fikri Aji and Vladimir Araujo and Abinew Ali Ayele and Pavan Baswani and Meriem Beloucif and +Chris Biemann and Sofia Bourhim and Christine De Kock and Genet Shanko Dekebo and +Oumaima Hourrane and Gopichand Kanumolu and Lokesh Madasu and Samuel Rutunda and Manish Shrivastava and +Thamar Solorio and Nirmal Surange and Hailegnaw Getaneh Tilaye and Krishnapriya Vishnubhotla and Genta Winata and +Seid Muhie Yimam and Saif M. 
Mohammad}, + eprint = {2402.08638}, + primaryclass = {cs.CL}, + title = {SemRel2024: A Collection of Semantic Textual Relatedness Datasets for 14 Languages}, + year = {2024}, +} +""", ) @property diff --git a/mteb/tasks/STS/pol/PolishSTS.py b/mteb/tasks/STS/pol/PolishSTS.py index 9115f37996..db1deb0eac 100644 --- a/mteb/tasks/STS/pol/PolishSTS.py +++ b/mteb/tasks/STS/pol/PolishSTS.py @@ -26,37 +26,37 @@ class SickrPLSTS(AbsTaskSTS): annotations_creators="human-annotated", dialect=[], sample_creation="human-translated and localized", - bibtex_citation=""" + bibtex_citation=r""" @inproceedings{dadas-etal-2020-evaluation, - title = "Evaluation of Sentence Representations in {P}olish", - author = "Dadas, Slawomir and - Perelkiewicz, Michal and - Poswiata, Rafal", - editor = "Calzolari, Nicoletta and - B{\'e}chet, Fr{\'e}d{\'e}ric and - Blache, Philippe and - Choukri, Khalid and - Cieri, Christopher and - Declerck, Thierry and - Goggi, Sara and - Isahara, Hitoshi and - Maegaard, Bente and - Mariani, Joseph and - Mazo, Helene and - Moreno, Asuncion and - Odijk, Jan and - Piperidis, Stelios", - booktitle = "Proceedings of the Twelfth Language Resources and Evaluation Conference", - month = may, - year = "2020", - address = "Marseille, France", - publisher = "European Language Resources Association", - url = "https://aclanthology.org/2020.lrec-1.207", - pages = "1674--1680", - language = "English", - ISBN = "979-10-95546-34-4", + address = {Marseille, France}, + author = {Dadas, Slawomir and +Perelkiewicz, Michal and +Poswiata, Rafal}, + booktitle = {Proceedings of the Twelfth Language Resources and Evaluation Conference}, + editor = {Calzolari, Nicoletta and +B{\'e}chet, Fr{\'e}d{\'e}ric and +Blache, Philippe and +Choukri, Khalid and +Cieri, Christopher and +Declerck, Thierry and +Goggi, Sara and +Isahara, Hitoshi and +Maegaard, Bente and +Mariani, Joseph and +Mazo, Helene and +Moreno, Asuncion and +Odijk, Jan and +Piperidis, Stelios}, + isbn = {979-10-95546-34-4}, + 
language = {English}, + month = may, + pages = {1674--1680}, + publisher = {European Language Resources Association}, + title = {Evaluation of Sentence Representations in {P}olish}, + url = {https://aclanthology.org/2020.lrec-1.207}, + year = {2020}, } - """, +""", ) @property @@ -89,24 +89,23 @@ class CdscrSTS(AbsTaskSTS): annotations_creators="human-annotated", dialect=[], sample_creation="human-translated and localized", - bibtex_citation=""" + bibtex_citation=r""" @inproceedings{wroblewska-krasnowska-kieras-2017-polish, - title = "{P}olish evaluation dataset for compositional distributional semantics models", - author = "Wr{\'o}blewska, Alina and - Krasnowska-Kiera{\'s}, Katarzyna", - editor = "Barzilay, Regina and - Kan, Min-Yen", - booktitle = "Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)", - month = jul, - year = "2017", - address = "Vancouver, Canada", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/P17-1073", - doi = "10.18653/v1/P17-1073", - pages = "784--792", + address = {Vancouver, Canada}, + author = {Wr{\'o}blewska, Alina and +Krasnowska-Kiera{\'s}, Katarzyna}, + booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)}, + doi = {10.18653/v1/P17-1073}, + editor = {Barzilay, Regina and +Kan, Min-Yen}, + month = jul, + pages = {784--792}, + publisher = {Association for Computational Linguistics}, + title = {{P}olish evaluation dataset for compositional distributional semantics models}, + url = {https://aclanthology.org/P17-1073}, + year = {2017}, } - - """, +""", ) @property diff --git a/mteb/tasks/STS/por/Assin2STS.py b/mteb/tasks/STS/por/Assin2STS.py index e96ae97c34..0719f03607 100644 --- a/mteb/tasks/STS/por/Assin2STS.py +++ b/mteb/tasks/STS/por/Assin2STS.py @@ -26,14 +26,16 @@ class Assin2STS(AbsTaskSTS): annotations_creators="human-annotated", dialect=[], 
sample_creation="found", - bibtex_citation="""@inproceedings{real2020assin, - title={The assin 2 shared task: a quick overview}, - author={Real, Livy and Fonseca, Erick and Oliveira, Hugo Goncalo}, - booktitle={International Conference on Computational Processing of the Portuguese Language}, - pages={406--412}, - year={2020}, - organization={Springer} - }""", + bibtex_citation=r""" +@inproceedings{real2020assin, + author = {Real, Livy and Fonseca, Erick and Oliveira, Hugo Goncalo}, + booktitle = {International Conference on Computational Processing of the Portuguese Language}, + organization = {Springer}, + pages = {406--412}, + title = {The assin 2 shared task: a quick overview}, + year = {2020}, +} +""", ) @property diff --git a/mteb/tasks/STS/por/SickBrSTS.py b/mteb/tasks/STS/por/SickBrSTS.py index 5298ab5437..d9733ed50b 100644 --- a/mteb/tasks/STS/por/SickBrSTS.py +++ b/mteb/tasks/STS/por/SickBrSTS.py @@ -29,27 +29,27 @@ class SickBrSTS(AbsTaskSTS): annotations_creators="human-annotated", dialect=[], sample_creation="human-translated and localized", - bibtex_citation=""" + bibtex_citation=r""" @inproceedings{real18, - author="Real, Livy - and Rodrigues, Ana - and Vieira e Silva, Andressa - and Albiero, Beatriz - and Thalenberg, Bruna - and Guide, Bruno - and Silva, Cindy - and de Oliveira Lima, Guilherme - and Camara, Igor C. S. - and Stanojevi{\'{c}}, Milo{\v{s}} - and Souza, Rodrigo - and de Paiva, Valeria" - year ="2018", - title="SICK-BR: A Portuguese Corpus for Inference", - booktitle="Computational Processing of the Portuguese Language. PROPOR 2018.", - doi ="10.1007/978-3-319-99722-3_31", - isbn="978-3-319-99722-3" + author = {Real, Livy +and Rodrigues, Ana +and Vieira e Silva, Andressa +and Albiero, Beatriz +and Thalenberg, Bruna +and Guide, Bruno +and Silva, Cindy +and de Oliveira Lima, Guilherme +and Camara, Igor C. S. 
+and Stanojevi{\'{c}}, Milo{\v{s}} +and Souza, Rodrigo +and de Paiva, Valeria}, + booktitle = {{Computational Processing of the Portuguese Language. PROPOR 2018.}}, + doi = {10.1007/978-3-319-99722-3_31}, + isbn = {978-3-319-99722-3}, + title = {{SICK-BR: A Portuguese Corpus for Inference}}, + year = {2018}, } - """, +""", ) @property diff --git a/mteb/tasks/STS/ron/RonSTS.py b/mteb/tasks/STS/ron/RonSTS.py index 4941cba3e6..d08dd109bb 100644 --- a/mteb/tasks/STS/ron/RonSTS.py +++ b/mteb/tasks/STS/ron/RonSTS.py @@ -28,14 +28,14 @@ class RonSTS(AbsTaskSTS): annotations_creators="human-annotated", dialect=[], sample_creation="machine-translated and verified", - bibtex_citation=""" - @inproceedings{dumitrescu2021liro, - title={LiRo: Benchmark and leaderboard for Romanian language tasks}, - author={Dumitrescu, Stefan Daniel and Rebeja, Petru and Lorincz, Beata and Gaman, Mihaela and Avram, Andrei and Ilie, Mihai and Pruteanu, Andrei and Stan, Adriana and Rosia, Lorena and Iacobescu, Cristina and others}, - booktitle={Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 1)}, - year={2021} - } - """, + bibtex_citation=r""" +@inproceedings{dumitrescu2021liro, + author = {Dumitrescu, Stefan Daniel and Rebeja, Petru and Lorincz, Beata and Gaman, Mihaela and Avram, Andrei and Ilie, Mihai and Pruteanu, Andrei and Stan, Adriana and Rosia, Lorena and Iacobescu, Cristina and others}, + booktitle = {Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 1)}, + title = {LiRo: Benchmark and leaderboard for Romanian language tasks}, + year = {2021}, +} +""", ) @property diff --git a/mteb/tasks/STS/rus/RUParaPhraserSTS.py b/mteb/tasks/STS/rus/RUParaPhraserSTS.py index 9174f2f661..19e3addfe5 100644 --- a/mteb/tasks/STS/rus/RUParaPhraserSTS.py +++ b/mteb/tasks/STS/rus/RUParaPhraserSTS.py @@ -27,30 +27,31 @@ class RUParaPhraserSTS(AbsTaskSTS): annotations_creators="human-annotated", 
dialect=[], sample_creation="found", - bibtex_citation=""" - @inproceedings{gudkov-etal-2020-automatically, - title = "Automatically Ranked {R}ussian Paraphrase Corpus for Text Generation", - author = "Gudkov, Vadim and - Mitrofanova, Olga and - Filippskikh, Elizaveta", - booktitle = "Proceedings of the Fourth Workshop on Neural Generation and Translation", - month = jul, - year = "2020", - address = "Online", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/2020.ngt-1.6", - doi = "10.18653/v1/2020.ngt-1.6", - pages = "54--59", - } - @inproceedings{pivovarova2017paraphraser, - title={ParaPhraser: Russian paraphrase corpus and shared task}, - author={Pivovarova, Lidia and Pronoza, Ekaterina and Yagunova, Elena and Pronoza, Anton}, - booktitle={Conference on artificial intelligence and natural language}, - pages={211--225}, - year={2017}, - organization={Springer} - } - """, + bibtex_citation=r""" +@inproceedings{gudkov-etal-2020-automatically, + address = {Online}, + author = {Gudkov, Vadim and +Mitrofanova, Olga and +Filippskikh, Elizaveta}, + booktitle = {Proceedings of the Fourth Workshop on Neural Generation and Translation}, + doi = {10.18653/v1/2020.ngt-1.6}, + month = jul, + pages = {54--59}, + publisher = {Association for Computational Linguistics}, + title = {Automatically Ranked {R}ussian Paraphrase Corpus for Text Generation}, + url = {https://aclanthology.org/2020.ngt-1.6}, + year = {2020}, +} + +@inproceedings{pivovarova2017paraphraser, + author = {Pivovarova, Lidia and Pronoza, Ekaterina and Yagunova, Elena and Pronoza, Anton}, + booktitle = {Conference on artificial intelligence and natural language}, + organization = {Springer}, + pages = {211--225}, + title = {ParaPhraser: Russian paraphrase corpus and shared task}, + year = {2017}, +} +""", ) @property diff --git a/mteb/tasks/STS/rus/RuSTSBenchmarkSTS.py b/mteb/tasks/STS/rus/RuSTSBenchmarkSTS.py index eca26691fa..44e3395e1c 100644 --- 
a/mteb/tasks/STS/rus/RuSTSBenchmarkSTS.py +++ b/mteb/tasks/STS/rus/RuSTSBenchmarkSTS.py @@ -18,7 +18,7 @@ class RuSTSBenchmarkSTS(AbsTaskSTS): type="STS", category="s2s", modalities=["text"], - eval_splits=["test"], + eval_splits=["test", "validation"], eval_langs=["rus-Cyrl"], main_score="cosine_spearman", date=("2012-01-01", "2018-01-01"), @@ -28,12 +28,14 @@ class RuSTSBenchmarkSTS(AbsTaskSTS): annotations_creators="human-annotated", dialect=[], sample_creation="machine-translated and verified", - bibtex_citation="""@InProceedings{huggingface:dataset:stsb_multi_mt, -title = {Machine translated multilingual STS benchmark dataset.}, -author={Philip May}, -year={2021}, -url={https://github.com/PhilipMay/stsb-multi-mt} -}""", + bibtex_citation=r""" +@inproceedings{huggingface:dataset:stsb_multi_mt, + author = {Philip May}, + title = {Machine translated multilingual STS benchmark dataset.}, + url = {https://github.com/PhilipMay/stsb-multi-mt}, + year = {2021}, +} +""", ) @property diff --git a/mteb/tasks/STS/spa/STSES.py b/mteb/tasks/STS/spa/STSES.py index 8bdbf227a2..844e132c77 100644 --- a/mteb/tasks/STS/spa/STSES.py +++ b/mteb/tasks/STS/spa/STSES.py @@ -30,21 +30,21 @@ class STSES(AbsTaskSTS): annotations_creators=None, dialect=None, sample_creation=None, - bibtex_citation="""@inproceedings{agirre2015semeval, - title={Semeval-2015 task 2: Semantic textual similarity, english, spanish and pilot on interpretability}, - author={Agirre, Eneko and Banea, Carmen and Cardie, Claire and Cer, Daniel and Diab, Mona and Gonzalez-Agirre, Aitor and Guo, Weiwei and Lopez-Gazpio, Inigo and Maritxalar, Montse and Mihalcea, Rada and others}, - booktitle={Proceedings of the 9th international workshop on semantic evaluation (SemEval 2015)}, - pages={252--263}, - year={2015} + bibtex_citation=r""" +@inproceedings{agirre2014semeval, + author = {Agirre, Eneko and Banea, Carmen and Cardie, Claire and Cer, Daniel M and Diab, Mona T and Gonzalez-Agirre, Aitor and Guo, Weiwei and Mihalcea, 
Rada and Rigau, German and Wiebe, Janyce}, + booktitle = {SemEval@ COLING}, + pages = {81--91}, + title = {SemEval-2014 Task 10: Multilingual Semantic Textual Similarity.}, + year = {2014}, } - -@inproceedings{agirre2014semeval, - title={SemEval-2014 Task 10: Multilingual Semantic Textual Similarity.}, - author={Agirre, Eneko and Banea, Carmen and Cardie, Claire and Cer, Daniel M and Diab, Mona T and Gonzalez-Agirre, Aitor and Guo, Weiwei and Mihalcea, Rada and Rigau, German and Wiebe, Janyce}, - booktitle={SemEval@ COLING}, - pages={81--91}, - year={2014} +@inproceedings{agirre2015semeval, + author = {Agirre, Eneko and Banea, Carmen and Cardie, Claire and Cer, Daniel and Diab, Mona and Gonzalez-Agirre, Aitor and Guo, Weiwei and Lopez-Gazpio, Inigo and Maritxalar, Montse and Mihalcea, Rada and others}, + booktitle = {Proceedings of the 9th international workshop on semantic evaluation (SemEval 2015)}, + pages = {252--263}, + title = {Semeval-2015 task 2: Semantic textual similarity, english, spanish and pilot on interpretability}, + year = {2015}, } """, ) diff --git a/mteb/tasks/STS/zho/CMTEBSTS.py b/mteb/tasks/STS/zho/CMTEBSTS.py index c7c0134d2a..fed68649eb 100644 --- a/mteb/tasks/STS/zho/CMTEBSTS.py +++ b/mteb/tasks/STS/zho/CMTEBSTS.py @@ -27,26 +27,28 @@ class ATEC(AbsTaskSTS): annotations_creators=None, dialect=None, sample_creation=None, - bibtex_citation="""@inproceedings{raghu-etal-2021-end, - title = "End-to-End Learning of Flowchart Grounded Task-Oriented Dialogs", - author = "Raghu, Dinesh and - Agarwal, Shantanu and - Joshi, Sachindra and - {Mausam}", - editor = "Moens, Marie-Francine and - Huang, Xuanjing and - Specia, Lucia and - Yih, Scott Wen-tau", - booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing", - month = nov, - year = "2021", - address = "Online and Punta Cana, Dominican Republic", - publisher = "Association for Computational Linguistics", - url = 
"https://aclanthology.org/2021.emnlp-main.357", - doi = "10.18653/v1/2021.emnlp-main.357", - pages = "4348--4366", - abstract = "We propose a novel problem within end-to-end learning of task oriented dialogs (TOD), in which the dialog system mimics a troubleshooting agent who helps a user by diagnosing their problem (e.g., car not starting). Such dialogs are grounded in domain-specific flowcharts, which the agent is supposed to follow during the conversation. Our task exposes novel technical challenges for neural TOD, such as grounding an utterance to the flowchart without explicit annotation, referring to additional manual pages when user asks a clarification question, and ability to follow unseen flowcharts at test time. We release a dataset (FLODIAL) consisting of 2,738 dialogs grounded on 12 different troubleshooting flowcharts. We also design a neural model, FLONET, which uses a retrieval-augmented generation architecture to train the dialog agent. Our experiments find that FLONET can do zero-shot transfer to unseen flowcharts, and sets a strong baseline for future research.", -}""", + bibtex_citation=r""" +@inproceedings{raghu-etal-2021-end, + abstract = {We propose a novel problem within end-to-end learning of task oriented dialogs (TOD), in which the dialog system mimics a troubleshooting agent who helps a user by diagnosing their problem (e.g., car not starting). Such dialogs are grounded in domain-specific flowcharts, which the agent is supposed to follow during the conversation. Our task exposes novel technical challenges for neural TOD, such as grounding an utterance to the flowchart without explicit annotation, referring to additional manual pages when user asks a clarification question, and ability to follow unseen flowcharts at test time. We release a dataset (FLODIAL) consisting of 2,738 dialogs grounded on 12 different troubleshooting flowcharts. 
We also design a neural model, FLONET, which uses a retrieval-augmented generation architecture to train the dialog agent. Our experiments find that FLONET can do zero-shot transfer to unseen flowcharts, and sets a strong baseline for future research.}, + address = {Online and Punta Cana, Dominican Republic}, + author = {Raghu, Dinesh and +Agarwal, Shantanu and +Joshi, Sachindra and +{Mausam}}, + booktitle = {Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing}, + doi = {10.18653/v1/2021.emnlp-main.357}, + editor = {Moens, Marie-Francine and +Huang, Xuanjing and +Specia, Lucia and +Yih, Scott Wen-tau}, + month = nov, + pages = {4348--4366}, + publisher = {Association for Computational Linguistics}, + title = {End-to-End Learning of Flowchart Grounded Task-Oriented Dialogs}, + url = {https://aclanthology.org/2021.emnlp-main.357}, + year = {2021}, +} +""", ) @property @@ -79,15 +81,17 @@ class BQ(AbsTaskSTS): annotations_creators=None, dialect=None, sample_creation=None, - bibtex_citation="""@misc{xiao2024cpackpackagedresourcesadvance, - title={C-Pack: Packaged Resources To Advance General Chinese Embedding}, - author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie}, - year={2024}, - eprint={2309.07597}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2309.07597}, -}""", + bibtex_citation=r""" +@misc{xiao2024cpackpackagedresourcesadvance, + archiveprefix = {arXiv}, + author = {Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie}, + eprint = {2309.07597}, + primaryclass = {cs.CL}, + title = {C-Pack: Packaged Resources To Advance General Chinese Embedding}, + url = {https://arxiv.org/abs/2309.07597}, + year = {2024}, +} +""", ) @property @@ -120,15 +124,17 @@ class LCQMC(AbsTaskSTS): annotations_creators=None, dialect=None, sample_creation=None, - bibtex_citation="""@misc{xiao2024cpackpackagedresourcesadvance, 
- title={C-Pack: Packaged Resources To Advance General Chinese Embedding}, - author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie}, - year={2024}, - eprint={2309.07597}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2309.07597}, -}""", + bibtex_citation=r""" +@misc{xiao2024cpackpackagedresourcesadvance, + archiveprefix = {arXiv}, + author = {Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie}, + eprint = {2309.07597}, + primaryclass = {cs.CL}, + title = {C-Pack: Packaged Resources To Advance General Chinese Embedding}, + url = {https://arxiv.org/abs/2309.07597}, + year = {2024}, +} +""", ) @property @@ -161,15 +167,17 @@ class PAWSX(AbsTaskSTS): annotations_creators=None, dialect=None, sample_creation=None, - bibtex_citation="""@misc{xiao2024cpackpackagedresourcesadvance, - title={C-Pack: Packaged Resources To Advance General Chinese Embedding}, - author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie}, - year={2024}, - eprint={2309.07597}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2309.07597}, -}""", + bibtex_citation=r""" +@misc{xiao2024cpackpackagedresourcesadvance, + archiveprefix = {arXiv}, + author = {Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie}, + eprint = {2309.07597}, + primaryclass = {cs.CL}, + title = {C-Pack: Packaged Resources To Advance General Chinese Embedding}, + url = {https://arxiv.org/abs/2309.07597}, + year = {2024}, +} +""", ) @property @@ -196,21 +204,23 @@ class STSB(AbsTaskSTS): eval_langs=["cmn-Hans"], main_score="cosine_spearman", date=None, - domains=None, + domains=[], task_subtypes=None, license=None, annotations_creators=None, dialect=None, sample_creation=None, - bibtex_citation="""@misc{xiao2024cpackpackagedresourcesadvance, - title={C-Pack: Packaged Resources To 
Advance General Chinese Embedding}, - author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie}, - year={2024}, - eprint={2309.07597}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2309.07597}, -}""", + bibtex_citation=r""" +@misc{xiao2024cpackpackagedresourcesadvance, + archiveprefix = {arXiv}, + author = {Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie}, + eprint = {2309.07597}, + primaryclass = {cs.CL}, + title = {C-Pack: Packaged Resources To Advance General Chinese Embedding}, + url = {https://arxiv.org/abs/2309.07597}, + year = {2024}, +} +""", ) @property @@ -243,26 +253,28 @@ class AFQMC(AbsTaskSTS): annotations_creators=None, dialect=None, sample_creation=None, - bibtex_citation="""@inproceedings{raghu-etal-2021-end, - title = "End-to-End Learning of Flowchart Grounded Task-Oriented Dialogs", - author = "Raghu, Dinesh and - Agarwal, Shantanu and - Joshi, Sachindra and - {Mausam}", - editor = "Moens, Marie-Francine and - Huang, Xuanjing and - Specia, Lucia and - Yih, Scott Wen-tau", - booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing", - month = nov, - year = "2021", - address = "Online and Punta Cana, Dominican Republic", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/2021.emnlp-main.357", - doi = "10.18653/v1/2021.emnlp-main.357", - pages = "4348--4366", - abstract = "We propose a novel problem within end-to-end learning of task oriented dialogs (TOD), in which the dialog system mimics a troubleshooting agent who helps a user by diagnosing their problem (e.g., car not starting). Such dialogs are grounded in domain-specific flowcharts, which the agent is supposed to follow during the conversation. 
Our task exposes novel technical challenges for neural TOD, such as grounding an utterance to the flowchart without explicit annotation, referring to additional manual pages when user asks a clarification question, and ability to follow unseen flowcharts at test time. We release a dataset (FLODIAL) consisting of 2,738 dialogs grounded on 12 different troubleshooting flowcharts. We also design a neural model, FLONET, which uses a retrieval-augmented generation architecture to train the dialog agent. Our experiments find that FLONET can do zero-shot transfer to unseen flowcharts, and sets a strong baseline for future research.", -}""", + bibtex_citation=r""" +@inproceedings{raghu-etal-2021-end, + abstract = {We propose a novel problem within end-to-end learning of task oriented dialogs (TOD), in which the dialog system mimics a troubleshooting agent who helps a user by diagnosing their problem (e.g., car not starting). Such dialogs are grounded in domain-specific flowcharts, which the agent is supposed to follow during the conversation. Our task exposes novel technical challenges for neural TOD, such as grounding an utterance to the flowchart without explicit annotation, referring to additional manual pages when user asks a clarification question, and ability to follow unseen flowcharts at test time. We release a dataset (FLODIAL) consisting of 2,738 dialogs grounded on 12 different troubleshooting flowcharts. We also design a neural model, FLONET, which uses a retrieval-augmented generation architecture to train the dialog agent. 
Our experiments find that FLONET can do zero-shot transfer to unseen flowcharts, and sets a strong baseline for future research.}, + address = {Online and Punta Cana, Dominican Republic}, + author = {Raghu, Dinesh and +Agarwal, Shantanu and +Joshi, Sachindra and +{Mausam}}, + booktitle = {Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing}, + doi = {10.18653/v1/2021.emnlp-main.357}, + editor = {Moens, Marie-Francine and +Huang, Xuanjing and +Specia, Lucia and +Yih, Scott Wen-tau}, + month = nov, + pages = {4348--4366}, + publisher = {Association for Computational Linguistics}, + title = {End-to-End Learning of Flowchart Grounded Task-Oriented Dialogs}, + url = {https://aclanthology.org/2021.emnlp-main.357}, + year = {2021}, +} +""", ) @property diff --git a/mteb/tasks/Summarization/eng/SummEvalSummarization.py b/mteb/tasks/Summarization/eng/SummEvalSummarization.py index 39708ffbeb..743160195a 100644 --- a/mteb/tasks/Summarization/eng/SummEvalSummarization.py +++ b/mteb/tasks/Summarization/eng/SummEvalSummarization.py @@ -32,12 +32,14 @@ class SummEvalSummarization(AbsTaskSummarization): annotations_creators="human-annotated", dialect=[], sample_creation="created", - bibtex_citation="""@article{fabbri2020summeval, - title={SummEval: Re-evaluating Summarization Evaluation}, - author={Fabbri, Alexander R and Kry{\'s}ci{\'n}ski, Wojciech and McCann, Bryan and Xiong, Caiming and Socher, Richard and Radev, Dragomir}, - journal={arXiv preprint arXiv:2007.12626}, - year={2020} -}""", + bibtex_citation=r""" +@article{fabbri2020summeval, + author = {Fabbri, Alexander R and Kry{\'s}ci{\'n}ski, Wojciech and McCann, Bryan and Xiong, Caiming and Socher, Richard and Radev, Dragomir}, + journal = {arXiv preprint arXiv:2007.12626}, + title = {SummEval: Re-evaluating Summarization Evaluation}, + year = {2020}, +} +""", ) @property @@ -70,12 +72,14 @@ class SummEvalSummarizationv2(AbsTaskSummarization): annotations_creators="human-annotated", 
dialect=[], sample_creation="created", - bibtex_citation="""@article{fabbri2020summeval, - title={SummEval: Re-evaluating Summarization Evaluation}, - author={Fabbri, Alexander R and Kry{\'s}ci{\'n}ski, Wojciech and McCann, Bryan and Xiong, Caiming and Socher, Richard and Radev, Dragomir}, - journal={arXiv preprint arXiv:2007.12626}, - year={2020} -}""", + bibtex_citation=r""" +@article{fabbri2020summeval, + author = {Fabbri, Alexander R and Kry{\'s}ci{\'n}ski, Wojciech and McCann, Bryan and Xiong, Caiming and Socher, Richard and Radev, Dragomir}, + journal = {arXiv preprint arXiv:2007.12626}, + title = {SummEval: Re-evaluating Summarization Evaluation}, + year = {2020}, +} +""", adapted_from=["SummEvalSummarization"], ) diff --git a/mteb/tasks/Summarization/fra/SummEvalFrSummarization.py b/mteb/tasks/Summarization/fra/SummEvalFrSummarization.py index 109792b375..964655eb9a 100644 --- a/mteb/tasks/Summarization/fra/SummEvalFrSummarization.py +++ b/mteb/tasks/Summarization/fra/SummEvalFrSummarization.py @@ -31,12 +31,14 @@ class SummEvalFrSummarization(AbsTaskSummarization): annotations_creators="human-annotated", dialect=[], sample_creation="machine-translated", - bibtex_citation="""@article{fabbri2020summeval, - title={SummEval: Re-evaluating Summarization Evaluation}, - author={Fabbri, Alexander R and Kry{\'s}ci{\'n}ski, Wojciech and McCann, Bryan and Xiong, Caiming and Socher, Richard and Radev, Dragomir}, - journal={arXiv preprint arXiv:2007.12626}, - year={2020} -}""", + bibtex_citation=r""" +@article{fabbri2020summeval, + author = {Fabbri, Alexander R and Kry{\'s}ci{\'n}ski, Wojciech and McCann, Bryan and Xiong, Caiming and Socher, Richard and Radev, Dragomir}, + journal = {arXiv preprint arXiv:2007.12626}, + title = {SummEval: Re-evaluating Summarization Evaluation}, + year = {2020}, +} +""", ) @property @@ -70,12 +72,14 @@ class SummEvalFrSummarizationv2(AbsTaskSummarization): annotations_creators="human-annotated", dialect=[], 
sample_creation="machine-translated", - bibtex_citation="""@article{fabbri2020summeval, - title={SummEval: Re-evaluating Summarization Evaluation}, - author={Fabbri, Alexander R and Kry{\'s}ci{\'n}ski, Wojciech and McCann, Bryan and Xiong, Caiming and Socher, Richard and Radev, Dragomir}, - journal={arXiv preprint arXiv:2007.12626}, - year={2020} -}""", + bibtex_citation=r""" +@article{fabbri2020summeval, + author = {Fabbri, Alexander R and Kry{\'s}ci{\'n}ski, Wojciech and McCann, Bryan and Xiong, Caiming and Socher, Richard and Radev, Dragomir}, + journal = {arXiv preprint arXiv:2007.12626}, + title = {SummEval: Re-evaluating Summarization Evaluation}, + year = {2020}, +} +""", adapted_from=["SummEvalFrSummarization"], ) diff --git a/mteb/tasks/__init__.py b/mteb/tasks/__init__.py index 7e283380ee..1b224a05be 100644 --- a/mteb/tasks/__init__.py +++ b/mteb/tasks/__init__.py @@ -10,8 +10,8 @@ from .Clustering import * from .Image.Any2AnyMultiChoice import * from .Image.Any2AnyRetrieval import * -from .Image.Clustering import * from .Image.ImageClassification import * +from .Image.ImageClustering import * from .Image.ImageMultilabelClassification import * from .Image.ImageTextPairClassification import * from .Image.VisualSTS import * diff --git a/mteb/tasks/aggregated_tasks/CQADupStackNLRetrieval.py b/mteb/tasks/aggregated_tasks/CQADupStackNLRetrieval.py index 46688bf199..c19ba6e8f5 100644 --- a/mteb/tasks/aggregated_tasks/CQADupStackNLRetrieval.py +++ b/mteb/tasks/aggregated_tasks/CQADupStackNLRetrieval.py @@ -53,14 +53,16 @@ class CQADupstackNLRetrieval(AbsTaskAggregate): annotations_creators="derived", dialect=[""], sample_creation="machine-translated and verified", # manually checked a small subset - bibtex_citation="""@misc{banar2024beirnlzeroshotinformationretrieval, - title={BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, - author={Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, - year={2024}, - eprint={2412.08329}, - 
archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2412.08329}, -}""", + bibtex_citation=r""" +@misc{banar2024beirnlzeroshotinformationretrieval, + archiveprefix = {arXiv}, + author = {Nikolay Banar and Ehsan Lotfi and Walter Daelemans}, + eprint = {2412.08329}, + primaryclass = {cs.CL}, + title = {BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, + url = {https://arxiv.org/abs/2412.08329}, + year = {2024}, +} +""", adapted_from=["CQADupstackRetrieval"], ) diff --git a/mteb/tasks/aggregated_tasks/CQADupStackRetrieval.py b/mteb/tasks/aggregated_tasks/CQADupStackRetrieval.py index 917a667eb3..8b8bda0e68 100644 --- a/mteb/tasks/aggregated_tasks/CQADupStackRetrieval.py +++ b/mteb/tasks/aggregated_tasks/CQADupStackRetrieval.py @@ -42,21 +42,23 @@ class CQADupstackRetrieval(AbsTaskAggregate): main_score="ndcg_at_10", type="Retrieval", # since everything is retrieval - otherwise it would be "Aggregated" eval_splits=["test"], - bibtex_citation="""@inproceedings{hoogeveen2015, -author = {Hoogeveen, Doris and Verspoor, Karin M. and Baldwin, Timothy}, -title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research}, -booktitle = {Proceedings of the 20th Australasian Document Computing Symposium (ADCS)}, -series = {ADCS '15}, -year = {2015}, -isbn = {978-1-4503-4040-3}, -location = {Parramatta, NSW, Australia}, -pages = {3:1--3:8}, -articleno = {3}, -numpages = {8}, -url = {http://doi.acm.org/10.1145/2838931.2838934}, -doi = {10.1145/2838931.2838934}, -acmid = {2838934}, -publisher = {ACM}, -address = {New York, NY, USA}, -}""", + bibtex_citation=r""" +@inproceedings{hoogeveen2015, + acmid = {2838934}, + address = {New York, NY, USA}, + articleno = {3}, + author = {Hoogeveen, Doris and Verspoor, Karin M. 
and Baldwin, Timothy}, + booktitle = {Proceedings of the 20th Australasian Document Computing Symposium (ADCS)}, + doi = {10.1145/2838931.2838934}, + isbn = {978-1-4503-4040-3}, + location = {Parramatta, NSW, Australia}, + numpages = {8}, + pages = {3:1--3:8}, + publisher = {ACM}, + series = {ADCS '15}, + title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research}, + url = {http://doi.acm.org/10.1145/2838931.2838934}, + year = {2015}, +} +""", ) diff --git a/mteb/tasks/aggregated_tasks/STS17MultilingualVisualSTS.py b/mteb/tasks/aggregated_tasks/STS17MultilingualVisualSTS.py index 563f09cbe6..e8a6730da6 100644 --- a/mteb/tasks/aggregated_tasks/STS17MultilingualVisualSTS.py +++ b/mteb/tasks/aggregated_tasks/STS17MultilingualVisualSTS.py @@ -28,12 +28,14 @@ class STS17MultilingualVisualSTSEng(AbsTaskAggregate): main_score="cosine_spearman", type="VisualSTS(eng)", eval_splits=["test"], - bibtex_citation="""@article{xiao2024pixel, - title={Pixel Sentence Representation Learning}, - author={Xiao, Chenghao and Huang, Zhuoxu and Chen, Danlu and Hudson, G Thomas and Li, Yizhi and Duan, Haoran and Lin, Chenghua and Fu, Jie and Han, Jungong and Moubayed, Noura Al}, - journal={arXiv preprint arXiv:2402.08183}, - year={2024} -}""", + bibtex_citation=r""" +@article{xiao2024pixel, + author = {Xiao, Chenghao and Huang, Zhuoxu and Chen, Danlu and Hudson, G Thomas and Li, Yizhi and Duan, Haoran and Lin, Chenghua and Fu, Jie and Han, Jungong and Moubayed, Noura Al}, + journal = {arXiv preprint arXiv:2402.08183}, + title = {Pixel Sentence Representation Learning}, + year = {2024}, +} +""", ) @@ -82,10 +84,12 @@ class STS17MultilingualVisualSTSMultilingual(AbsTaskAggregate): "it-en": ["ita-Latn", "eng-Latn"], "nl-en": ["nld-Latn", "eng-Latn"], }, - bibtex_citation="""@article{xiao2024pixel, - title={Pixel Sentence Representation Learning}, - author={Xiao, Chenghao and Huang, Zhuoxu and Chen, Danlu and Hudson, G Thomas and Li, Yizhi and Duan, Haoran and Lin, 
Chenghua and Fu, Jie and Han, Jungong and Moubayed, Noura Al}, - journal={arXiv preprint arXiv:2402.08183}, - year={2024} -}""", + bibtex_citation=r""" +@article{xiao2024pixel, + author = {Xiao, Chenghao and Huang, Zhuoxu and Chen, Danlu and Hudson, G Thomas and Li, Yizhi and Duan, Haoran and Lin, Chenghua and Fu, Jie and Han, Jungong and Moubayed, Noura Al}, + journal = {arXiv preprint arXiv:2402.08183}, + title = {Pixel Sentence Representation Learning}, + year = {2024}, +} +""", ) diff --git a/mteb/tasks/aggregated_tasks/STSBenchmarkMultilingualVisualSTS.py b/mteb/tasks/aggregated_tasks/STSBenchmarkMultilingualVisualSTS.py index 74c5f9feb6..94cc13ecef 100644 --- a/mteb/tasks/aggregated_tasks/STSBenchmarkMultilingualVisualSTS.py +++ b/mteb/tasks/aggregated_tasks/STSBenchmarkMultilingualVisualSTS.py @@ -26,12 +26,14 @@ class STSBenchmarkMultilingualVisualSTSEng(AbsTaskAggregate): type="VisualSTS(eng)", eval_splits=["test"], eval_langs=["eng-Latn"], - bibtex_citation="""@article{xiao2024pixel, - title={Pixel Sentence Representation Learning}, - author={Xiao, Chenghao and Huang, Zhuoxu and Chen, Danlu and Hudson, G Thomas and Li, Yizhi and Duan, Haoran and Lin, Chenghua and Fu, Jie and Han, Jungong and Moubayed, Noura Al}, - journal={arXiv preprint arXiv:2402.08183}, - year={2024} -}""", + bibtex_citation=r""" +@article{xiao2024pixel, + author = {Xiao, Chenghao and Huang, Zhuoxu and Chen, Danlu and Hudson, G Thomas and Li, Yizhi and Duan, Haoran and Lin, Chenghua and Fu, Jie and Han, Jungong and Moubayed, Noura Al}, + journal = {arXiv preprint arXiv:2402.08183}, + title = {Pixel Sentence Representation Learning}, + year = {2024}, +} +""", ) @@ -88,10 +90,12 @@ class STSBenchmarkMultilingualVisualSTSMultilingual(AbsTaskAggregate): "rus-Cyrl", "cmn-Hans", ], - bibtex_citation="""@article{xiao2024pixel, - title={Pixel Sentence Representation Learning}, - author={Xiao, Chenghao and Huang, Zhuoxu and Chen, Danlu and Hudson, G Thomas and Li, Yizhi and Duan, Haoran and 
Lin, Chenghua and Fu, Jie and Han, Jungong and Moubayed, Noura Al}, - journal={arXiv preprint arXiv:2402.08183}, - year={2024} -}""", + bibtex_citation=r""" +@article{xiao2024pixel, + author = {Xiao, Chenghao and Huang, Zhuoxu and Chen, Danlu and Hudson, G Thomas and Li, Yizhi and Duan, Haoran and Lin, Chenghua and Fu, Jie and Han, Jungong and Moubayed, Noura Al}, + journal = {arXiv preprint arXiv:2402.08183}, + title = {Pixel Sentence Representation Learning}, + year = {2024}, +} +""", ) diff --git a/pyproject.toml b/pyproject.toml index 0832c20861..b86989b2b2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.36.36" +version = "1.38.4" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ @@ -62,6 +62,7 @@ dev = [ "pytest-rerunfailures>=15.0", "iso639>=0.1.4", # used for tests/scripts/test_generate_model_meta.py "pre-commit>=4.1.0", +"bibtexparser>=1.4.3" # used for tests/test_citation_formatting.py ] codecarbon = ["codecarbon"] speedtask = [ @@ -71,7 +72,7 @@ speedtask = [ ] peft = ["peft>=0.11.0"] leaderboard = [ - "gradio==5.16.0; python_version > '3.9'", # 3.10 is required for gradio + "gradio==5.27.1; python_version > '3.9'", # 3.10 is required for gradio "gradio_rangeslider>=0.0.8", "plotly>=5.24.0,<6.0.0", "cachetools>=5.2.0", diff --git a/scripts/data/scandisent/create_data.py b/scripts/data/scandisent/create_data.py new file mode 100644 index 0000000000..0a1b3d6ba0 --- /dev/null +++ b/scripts/data/scandisent/create_data.py @@ -0,0 +1,26 @@ +from __future__ import annotations + +from datasets import Dataset, DatasetDict, load_dataset +from huggingface_hub import create_repo + +ds = load_dataset("timpal0l/scandisent") +repo_name = "mteb/scandisent" +create_repo(repo_name, repo_type="dataset") + +ds1 = {} +df_split = ds["train"].to_polars() +df_grouped = dict(df_split.group_by(["language"])) +for lang in set(df_split["language"].unique()): 
+ ds1.setdefault(lang, {}) + # Remove lang column and convert back to HF datasets, not strictly necessary but better for compatibility + ds1[lang] = DatasetDict( + { + "train": Dataset.from_polars(df_grouped[(lang,)].drop("language")).select( + range(7500) + ), + "test": Dataset.from_polars(df_grouped[(lang,)].drop("language")).select( + range(7500, 10000) + ), + } + ) + ds1[lang].push_to_hub(repo_name, config_name=lang) diff --git a/scripts/format_citations.py b/scripts/format_citations.py new file mode 100644 index 0000000000..e1e9339e35 --- /dev/null +++ b/scripts/format_citations.py @@ -0,0 +1,342 @@ +from __future__ import annotations + +import ast +import logging +from pathlib import Path + +import bibtexparser +import typer +from bibtexparser.bwriter import BibTexWriter + +app = typer.Typer() + +logging.basicConfig( + level=logging.INFO, + format="%(levelname)s: %(message)s", +) +logger = logging.getLogger(__name__) + + +class KeywordLiteralFinder(ast.NodeVisitor): + def __init__(self, target_function_name: str, target_keyword_arg: str): + self.target_function_name = target_function_name + self.target_keyword_arg = target_keyword_arg + self.locations: list[tuple[int, int, int, int]] = [] + self.keyword_found_anywhere = False + + def visit_Call(self, node: ast.Call): + func_name = "" + if isinstance(node.func, ast.Name): + func_name = node.func.id + elif isinstance(node.func, ast.Attribute): + func_name = node.func.attr + + if func_name != self.target_function_name: + self.generic_visit(node) + return + + for keyword in node.keywords: + if keyword.arg != self.target_keyword_arg: + continue + self.keyword_found_anywhere = True + if not isinstance(keyword.value, ast.Constant) or not isinstance( + keyword.value.value, str + ): + continue + + if ( + keyword.value.end_lineno is not None + and keyword.value.end_col_offset is not None + ): + self.locations.append( + ( + keyword.value.lineno, + keyword.value.col_offset, + keyword.value.end_lineno, + 
keyword.value.end_col_offset, + ) + ) + else: + logger.warning( + f"Could not get end location for a {self.target_keyword_arg} string. Skipping this instance." + ) + self.generic_visit(node) + + +def extract_string_literal( + lines: list[str], location: tuple[int, int, int, int] +) -> tuple[str | None, str | None]: + start_line, start_col, end_line, end_col = location + start_line_0, end_line_0 = start_line - 1, end_line - 1 + + if ( + start_line_0 < 0 + or end_line_0 >= len(lines) + or start_col > len(lines[start_line_0]) + or end_col > len(lines[end_line_0]) + ): + return None, None + + if start_line == end_line: + literal = lines[start_line_0][start_col:end_col] + else: + first_line = lines[start_line_0][start_col:] + middle_lines = ( + lines[start_line_0 + 1 : end_line_0] + if start_line_0 + 1 <= end_line_0 + else [] + ) + last_line = lines[end_line_0][:end_col] + literal = "\n".join([first_line] + middle_lines + [last_line]) + + quote_types = ['"""', "'''", '"', "'"] + for quote in quote_types: + for prefix in [f"r{quote}", quote]: + if literal.startswith(prefix) and literal.endswith(quote): + return literal[len(prefix) : -len(quote)], quote + + return None, None + + +def format_bibtex(bibtex_str: str) -> str | None: + parser = bibtexparser.bparser.BibTexParser( + common_strings=True, ignore_nonstandard_types=False, interpolate_strings=False + ) + + try: + bib_database = bibtexparser.loads(bibtex_str, parser=parser) + if not bib_database.entries: + return None + bib_database.comments = [] + + writer = BibTexWriter() + writer.indent = " " + writer.comma_first = False + writer.add_trailing_comma = True + + return writer.write(bib_database).strip() + except Exception: + return None + + +def process_file( + file_path: Path, + target_function_name: str, + target_keyword_arg: str, + dry_run: bool, +) -> tuple[bool, bool, int, bool, bool]: + file_modified = file_error = skipped_no_keyword = skipped_no_locations = False + num_modified_in_file = 0 + 
replacements_for_file = [] + + try: + content = file_path.read_text() + tree = ast.parse(content, filename=str(file_path)) + + finder = KeywordLiteralFinder(target_function_name, target_keyword_arg) + finder.visit(tree) + + if not finder.keyword_found_anywhere: + return False, False, 0, True, False + + if not finder.locations: + return False, False, 0, False, True + + content_lines = content.splitlines() + content_lines_with_endings = content.splitlines(True) + + for location in finder.locations: + literal_value, quote_type = extract_string_literal(content_lines, location) + + if literal_value is None or quote_type is None: + logger.error( + f"In {file_path.name}: Could not extract {target_keyword_arg} string literal at {location}" + ) + file_error = True + continue + + literal_str = literal_value.strip() + if not literal_str: + continue + + formatted_literal = format_bibtex(literal_str) + if formatted_literal is None: + logger.error( + f"In {file_path.name}: Failed to parse/format {target_keyword_arg} at {location}" + ) + file_error = True + continue + + if literal_str == formatted_literal: + continue + + new_literal = f'r"""\n{formatted_literal}\n"""' + + start_line, start_col, end_line, end_col = location + start_char_index = ( + sum(len(line) for line in content_lines_with_endings[: start_line - 1]) + + start_col + ) + end_char_index = ( + sum(len(line) for line in content_lines_with_endings[: end_line - 1]) + + end_col + ) + + original_slice = content[start_char_index:end_char_index] + matched_prefix = "" + if original_slice.startswith(f"r{quote_type}"): + matched_prefix = "r" + + full_original_literal = ( + f"{matched_prefix}{quote_type}{literal_value}{quote_type}" + ) + + try: + actual_start = content.index(full_original_literal, start_char_index) + actual_end = actual_start + len(full_original_literal) + replacements_for_file.append((actual_start, actual_end, new_literal)) + num_modified_in_file += 1 + except ValueError: + logger.warning( + f"In 
{file_path.name}: Could not find exact original literal match for {target_keyword_arg} at {location}. Using offset-based replacement." + ) + replacements_for_file.append( + (start_char_index, end_char_index, new_literal) + ) + num_modified_in_file += 1 + + if replacements_for_file: + replacements_for_file.sort(key=lambda x: x[0], reverse=True) + new_content = content + for start, end, literal in replacements_for_file: + new_content = new_content[:start] + literal + new_content[end:] + + if not dry_run: + file_path.write_text(new_content) + file_modified = True + + except SyntaxError as e: + logger.error(f"SyntaxError in {file_path.name}: {e}") + file_error = True + except Exception as e: + logger.error(f"Unexpected error in {file_path.name}: {e}") + import traceback + + traceback.print_exc() + file_error = True + + return ( + file_modified, + file_error, + num_modified_in_file, + skipped_no_keyword, + skipped_no_locations, + ) + + +@app.command() +def tasks( + tasks_dir: Path = typer.Argument( + Path("mteb/tasks"), + exists=True, + file_okay=False, + dir_okay=True, + readable=True, + help="Directory containing MTEB task Python files.", + ), + dry_run: bool = typer.Option( + True, + "--dry-run", + help="Perform parsing and formatting but do not modify files.", + ), +): + modified_files = error_files = skipped_files = processed_files = bibtex_modified = 0 + task_files = sorted(tasks_dir.rglob("*.py")) + + if not task_files: + logger.error(f"No Python files found in {tasks_dir}") + raise typer.Exit(code=1) + + logger.info(f"Found {len(task_files)} Python files in {tasks_dir}. 
Processing...") + + for file_path in task_files: + if file_path.name == "__init__.py": + continue + + processed_files += 1 + file_modified, file_error, num_modified, no_keyword, no_locations = ( + process_file(file_path, "TaskMetadata", "bibtex_citation", dry_run) + ) + + if file_error: + error_files += 1 + elif file_modified: + modified_files += 1 + bibtex_modified += num_modified + else: + skipped_files += 1 + + logger.info("\n--- Summary ---") + logger.info(f"Processed Files: {processed_files}") + logger.info(f"Modified Files: {modified_files}") + logger.info(f"Skipped Files: {skipped_files}") + logger.info(f"Error Files: {error_files}") + logger.info(f"Total BibTeX Instances Modified: {bibtex_modified}") + + if dry_run: + logger.info("\nNOTE: Dry run mode was enabled. No files were actually changed.") + + if error_files > 0: + logger.warning("Errors occurred during processing. Check logs above.") + raise typer.Exit(code=1) + + +@app.command() +def benchmarks( + benchmarks_file: Path = typer.Argument( + Path("mteb/benchmarks/benchmarks.py"), + exists=True, + file_okay=True, + dir_okay=False, + readable=True, + help="Path to the benchmarks.py file.", + ), + dry_run: bool = typer.Option( + True, + "--dry-run", + help="Perform parsing and formatting but do not modify the file.", + ), +): + logger.info(f"Processing {benchmarks_file}...") + + file_modified, file_error, num_modified, no_keyword, no_locations = process_file( + benchmarks_file, "Benchmark", "citation", dry_run + ) + + if no_keyword: + logger.info(f"SKIPPED: No 'citation' keyword found in {benchmarks_file.name}.") + raise typer.Exit() + if no_locations: + logger.info( + f"SKIPPED: 'citation' keyword found, but no valid string literals detected in {benchmarks_file.name}." 
+ ) + raise typer.Exit() + + logger.info("\n--- Summary ---") + logger.info(f"Processed File: {benchmarks_file.name}") + logger.info(f"Modified: {'Yes' if file_modified else 'No'}") + logger.info(f"Errors Occurred: {'Yes' if file_error else 'No'}") + logger.info(f"Citations Modified: {num_modified}") + + if dry_run and file_modified: + logger.info("\nNOTE: Dry run mode was enabled. File was not actually changed.") + + if file_error: + logger.warning("Errors occurred during processing. Check logs above.") + raise typer.Exit(code=1) + elif not file_modified and not file_error: + logger.info("No changes needed.") + + +if __name__ == "__main__": + app() diff --git a/scripts/task_selection/europe_results.csv b/scripts/task_selection/europe_results.csv deleted file mode 100644 index 6abf16e11a..0000000000 --- a/scripts/task_selection/europe_results.csv +++ /dev/null @@ -1,13 +0,0 @@ -,model,revision,mean,mean (Classification),mean (Retrieval),mean (PairClassification),mean (BitextMining),mean (Clustering),mean (MultilabelClassification),mean (STS),mean (Reranking),mean (InstructionRetrieval),mean (wieghted by task type),borda_count,Total Evaluation time (hours) -2,GritLM/GritLM-7B,13f00a0e36500c80ce12870ea513846a066004af,0.607,0.643,0.571,0.894,0.708,0.435,0.176,0.755,0.589,0.035,0.534,680.0,6.408 -7,intfloat/multilingual-e5-large-instruct,baa7be480a7de1539afce709c8f13f833a510e0a,0.61,0.635,0.555,0.899,0.767,0.46,0.173,0.772,0.575,-0.004,0.537,679.0,4.463 -11,intfloat/e5-mistral-7b-instruct,07163b72af1488142a360786df853f237b1a3ca1,0.592,0.625,0.524,0.907,0.702,0.445,0.155,0.76,0.585,-0.006,0.522,643.0,5.718 -3,intfloat/multilingual-e5-large,4dc6d853a804b9c8886ede6dda8a073b7dc08a81,0.571,0.609,0.513,0.887,0.69,0.367,0.15,0.756,0.552,-0.031,0.499,527.0,5.765 -9,intfloat/multilingual-e5-base,d13f1b27baf31030b7fd040960d60d909913633f,0.557,0.583,0.506,0.876,0.683,0.367,0.149,0.734,0.53,-0.027,0.489,438.0,2.712 
-4,sentence-transformers/paraphrase-multilingual-mpnet-base-v2,79f2382ceacceacdf38563d7c5d16b9ff8d725d6,0.512,0.554,0.393,0.906,0.554,0.343,0.069,0.741,0.516,-0.011,0.451,387.0,14.898 -0,intfloat/multilingual-e5-small,e4ce9877abf3edfe10b0d82785e83bdcb973e22e,0.537,0.565,0.465,0.869,0.66,0.355,0.14,0.71,0.534,-0.024,0.475,347.0,1.901 -1,sentence-transformers/LaBSE,e34fab64a3011d2176c99545a93d5cbddc9a91b7,0.498,0.54,0.338,0.85,0.723,0.335,0.163,0.657,0.488,-0.03,0.452,296.0,2.439 -5,sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2,bf3bf13ab40c3157080a7ab344c831b9ad18b5eb,0.484,0.517,0.355,0.888,0.513,0.327,0.057,0.724,0.492,-0.013,0.429,252.0,1.809 -6,sentence-transformers/all-mpnet-base-v2,84f2bcc00d77236f9e89c8a360a00fb1139bf47d,0.433,0.485,0.359,0.796,0.236,0.36,0.109,0.63,0.472,-0.031,0.379,241.5,2.887 -8,sentence-transformers/all-MiniLM-L12-v2,a05860a77cef7b37e0048a7864658139bc18a854,0.431,0.487,0.345,0.809,0.256,0.323,0.076,0.635,0.47,-0.008,0.377,221.0,1.78 -10,sentence-transformers/all-MiniLM-L6-v2,8b3219a92973c328a8e22fadcfa821b5dc75636a,0.425,0.475,0.366,0.796,0.218,0.335,0.088,0.618,0.445,-0.028,0.368,172.5,1.606 diff --git a/scripts/task_selection/indic_results.csv b/scripts/task_selection/indic_results.csv deleted file mode 100644 index 49c7a77297..0000000000 --- a/scripts/task_selection/indic_results.csv +++ /dev/null @@ -1,13 +0,0 @@ -,model,revision,mean,mean (BitextMining),mean (Classification),mean (Retrieval),mean (STS),mean (Reranking),mean (Clustering),mean (PairClassification),mean (wieghted by task type),borda_count,Total Evaluation time (hours) -7,intfloat/multilingual-e5-large-instruct,baa7be480a7de1539afce709c8f13f833a510e0a,0.718,0.703,0.709,0.887,0.537,0.91,0.472,0.785,0.715,224.0,1.887 -3,intfloat/multilingual-e5-large,4dc6d853a804b9c8886ede6dda8a073b7dc08a81,0.645,0.644,0.631,0.875,0.439,0.897,0.237,0.739,0.637,190.0,1.269 
-2,GritLM/GritLM-7B,13f00a0e36500c80ce12870ea513846a066004af,0.646,0.607,0.652,0.832,0.272,0.91,0.361,0.741,0.625,165.0,2.136 -9,intfloat/multilingual-e5-base,d13f1b27baf31030b7fd040960d60d909913633f,0.625,0.612,0.619,0.833,0.411,0.877,0.216,0.71,0.611,164.0,0.851 -11,intfloat/e5-mistral-7b-instruct,07163b72af1488142a360786df853f237b1a3ca1,0.637,0.616,0.636,0.808,0.23,0.903,0.387,0.779,0.623,154.0,1.644 -0,intfloat/multilingual-e5-small,e4ce9877abf3edfe10b0d82785e83bdcb973e22e,0.619,0.612,0.613,0.808,0.408,0.87,0.239,0.69,0.606,150.0,0.736 -1,sentence-transformers/LaBSE,e34fab64a3011d2176c99545a93d5cbddc9a91b7,0.607,0.636,0.6,0.716,0.528,0.809,0.188,0.652,0.59,135.0,0.809 -4,sentence-transformers/paraphrase-multilingual-mpnet-base-v2,79f2382ceacceacdf38563d7c5d16b9ff8d725d6,0.571,0.42,0.602,0.696,0.341,0.822,0.241,0.827,0.564,127.0,0.785 -5,sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2,bf3bf13ab40c3157080a7ab344c831b9ad18b5eb,0.5,0.236,0.563,0.641,0.198,0.785,0.194,0.789,0.486,91.0,0.668 -6,sentence-transformers/all-mpnet-base-v2,84f2bcc00d77236f9e89c8a360a00fb1139bf47d,0.364,0.072,0.472,0.323,-0.025,0.647,0.089,0.584,0.309,52.0,0.84 -8,sentence-transformers/all-MiniLM-L12-v2,a05860a77cef7b37e0048a7864658139bc18a854,0.359,0.078,0.46,0.329,-0.053,0.692,0.076,0.584,0.31,39.0,0.689 -10,sentence-transformers/all-MiniLM-L6-v2,8b3219a92973c328a8e22fadcfa821b5dc75636a,0.351,0.063,0.463,0.294,-0.063,0.645,0.066,0.574,0.292,27.0,0.629 diff --git a/scripts/task_selection/mteb_lite_results.csv b/scripts/task_selection/mteb_lite_results.csv deleted file mode 100644 index e382c9e3b1..0000000000 --- a/scripts/task_selection/mteb_lite_results.csv +++ /dev/null @@ -1,13 +0,0 @@ -,model,revision,mean,mean (Clustering),mean (STS),mean (Classification),mean (Reranking),mean (Retrieval),mean (PairClassification),mean (weighted by task type),borda_count,Total Evaluation time (hours),Total CO2-eq emissions (kg) 
-11,intfloat/e5-mistral-7b-instruct,07163b72af1488142a360786df853f237b1a3ca1,0.67,0.514,0.836,0.752,0.498,0.548,0.884,0.672,393.0,2.502,2.971 -2,GritLM/GritLM-7B,13f00a0e36500c80ce12870ea513846a066004af,0.664,0.508,0.825,0.77,0.496,0.532,0.873,0.667,384.0,3.111,3.409 -7,intfloat/multilingual-e5-large-instruct,baa7be480a7de1539afce709c8f13f833a510e0a,0.652,0.499,0.843,0.732,0.487,0.51,0.862,0.656,357.0,2.033,1.418 -3,intfloat/multilingual-e5-large,4dc6d853a804b9c8886ede6dda8a073b7dc08a81,0.621,0.428,0.806,0.728,0.447,0.49,0.847,0.624,270.0,2.549,1.563 -6,sentence-transformers/all-mpnet-base-v2,84f2bcc00d77236f9e89c8a360a00fb1139bf47d,0.56,0.466,0.722,0.566,0.484,0.419,0.83,0.581,211.0,1.19,0.688 -9,intfloat/multilingual-e5-base,d13f1b27baf31030b7fd040960d60d909913633f,0.602,0.422,0.791,0.7,0.443,0.461,0.836,0.609,211.0,1.17,0.691 -4,sentence-transformers/paraphrase-multilingual-mpnet-base-v2,79f2382ceacceacdf38563d7c5d16b9ff8d725d6,0.573,0.435,0.798,0.686,0.452,0.341,0.817,0.588,188.0,1.017,0.563 -8,sentence-transformers/all-MiniLM-L12-v2,a05860a77cef7b37e0048a7864658139bc18a854,0.547,0.446,0.707,0.558,0.475,0.407,0.825,0.57,172.0,0.814,0.442 -10,sentence-transformers/all-MiniLM-L6-v2,8b3219a92973c328a8e22fadcfa821b5dc75636a,0.544,0.449,0.704,0.554,0.471,0.398,0.824,0.567,149.0,0.733,0.391 -0,intfloat/multilingual-e5-small,e4ce9877abf3edfe10b0d82785e83bdcb973e22e,0.584,0.408,0.776,0.677,0.432,0.437,0.827,0.593,147.0,0.833,0.459 -5,sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2,bf3bf13ab40c3157080a7ab344c831b9ad18b5eb,0.551,0.417,0.775,0.644,0.454,0.328,0.8,0.57,109.0,0.879,0.469 -1,sentence-transformers/LaBSE,e34fab64a3011d2176c99545a93d5cbddc9a91b7,0.486,0.361,0.702,0.668,0.413,0.168,0.789,0.517,49.0,1.02,0.582 diff --git a/scripts/task_selection/mteb_lite_tasks.csv b/scripts/task_selection/mteb_lite_tasks.csv deleted file mode 100644 index 25e359e383..0000000000 --- a/scripts/task_selection/mteb_lite_tasks.csv +++ /dev/null @@ -1,41 +0,0 @@ 
-,name,type,languages,domains,license -0,AmazonCounterfactualClassification,Classification,"['deu', 'eng', 'jpn']","['Reviews', 'Written']",cc-by-4.0 -1,ArguAna,Retrieval,['eng'],"['Medical', 'Written']",cc-by-sa-4.0 -2,ArXivHierarchicalClusteringP2P,Clustering,['eng'],"['Academic', 'Written']",cc0-1.0 -3,ArXivHierarchicalClusteringS2S,Clustering,['eng'],"['Academic', 'Written']",cc0-1.0 -4,AskUbuntuDupQuestions,Reranking,['eng'],, -5,BIOSSES,STS,['eng'],, -6,Banking77Classification,Classification,['eng'],['Written'],mit -7,BiorxivClusteringP2P.v2,Clustering,['eng'],"['Academic', 'Written']",https://www.biorxiv.org/content/about-biorxiv -8,CQADupstackGamingRetrieval,Retrieval,['eng'],, -9,CQADupstackUnixRetrieval,Retrieval,['eng'],, -10,ClimateFEVERHardNegatives,Retrieval,['eng'],, -11,FEVERHardNegatives,Retrieval,['eng'],, -12,FiQA2018,Retrieval,['eng'],, -13,HotpotQAHardNegatives,Retrieval,['eng'],"['Web', 'Written']",cc-by-sa-4.0 -14,ImdbClassification,Classification,['eng'],"['Reviews', 'Written']",not specified -15,MTOPDomainClassification,Classification,"['deu', 'eng', 'fra', 'hin', 'spa', 'tha']","['Spoken', 'Spoken']",not specified -16,MassiveIntentClassification,Classification,"['afr', 'amh', 'ara', 'aze', 'ben', 'cmo', 'cym', 'dan', 'deu', 'ell', 'eng', 'fas', 'fin', 'fra', 'heb', 'hin', 'hun', 'hye', 'ind', 'isl', 'ita', 'jav', 'jpn', 'kan', 'kat', 'khm', 'kor', 'lav', 'mal', 'mon', 'msa', 'mya', 'nld', 'nob', 'pol', 'por', 'ron', 'rus', 'slv', 'spa', 'sqi', 'swa', 'swe', 'tam', 'tel', 'tgl', 'tha', 'tur', 'urd', 'vie']",['Spoken'],apache-2.0 -17,MassiveScenarioClassification,Classification,"['afr', 'amh', 'ara', 'aze', 'ben', 'cmo', 'cym', 'dan', 'deu', 'ell', 'eng', 'fas', 'fin', 'fra', 'heb', 'hin', 'hun', 'hye', 'ind', 'isl', 'ita', 'jav', 'jpn', 'kan', 'kat', 'khm', 'kor', 'lav', 'mal', 'mon', 'msa', 'mya', 'nld', 'nob', 'pol', 'por', 'ron', 'rus', 'slv', 'spa', 'sqi', 'swa', 'swe', 'tam', 'tel', 'tgl', 'tha', 'tur', 'urd', 
'vie']",['Spoken'],apache-2.0 -18,MedrxivClusteringP2P.v2,Clustering,['eng'],"['Academic', 'Medical', 'Written']",https://www.medrxiv.org/content/about-medrxiv -19,MedrxivClusteringS2S.v2,Clustering,['eng'],"['Academic', 'Medical', 'Written']",https://www.medrxiv.org/content/about-medrxiv -20,MindSmallReranking,Reranking,['eng'],"['News', 'Written']",https://github.com/msnews/MIND/blob/master/MSR%20License_Data.pdf -21,SCIDOCS,Retrieval,['eng'],"['Academic', 'Written', 'Non-fiction']",cc-by-sa-4.0 -22,SICK-R,STS,['eng'],, -23,STS12,STS,['eng'],"['Encyclopaedic', 'News', 'Written']",not specified -24,STS13,STS,['eng'],"['Web', 'News', 'Non-fiction', 'Written']",not specified -25,STS14,STS,['eng'],"['Blog', 'Web', 'Spoken']",not specified -26,STS15,STS,['eng'],"['Blog', 'News', 'Web', 'Written', 'Spoken']",not specified -27,STS17,STS,"['ara', 'deu', 'eng', 'fra', 'ita', 'kor', 'nld', 'spa', 'tur']","['News', 'Web', 'Written']",not specified -28,STS22.v2,STS,"['ara', 'cmn', 'deu', 'eng', 'fra', 'ita', 'pol', 'rus', 'spa', 'tur']","['News', 'Written']",not specified -29,STSBenchmark,STS,['eng'],, -30,SprintDuplicateQuestions,PairClassification,['eng'],"['Programming', 'Written']",not specified -31,StackExchangeClustering.v2,Clustering,['eng'],"['Web', 'Written']",not specified -32,StackExchangeClusteringP2P.v2,Clustering,['eng'],"['Web', 'Written']",not specified -33,TRECCOVID,Retrieval,['eng'],, -34,Touche2020,Retrieval,['eng'],, -35,ToxicConversationsClassification,Classification,['eng'],"['Social', 'Written']",cc-by-4.0 -36,TweetSentimentExtractionClassification,Classification,['eng'],"['Social', 'Written']",not specified -37,TwentyNewsgroupsClustering.v2,Clustering,['eng'],"['News', 'Written']",not specified -38,TwitterSemEval2015,PairClassification,['eng'],, -39,TwitterURLCorpus,PairClassification,['eng'],, diff --git a/scripts/task_selection/mult_results.csv b/scripts/task_selection/mult_results.csv deleted file mode 100644 index 98edf2b0e1..0000000000 --- 
a/scripts/task_selection/mult_results.csv +++ /dev/null @@ -1,13 +0,0 @@ -,model,revision,mean,mean (BitextMining),mean (PairClassification),mean (Classification),mean (STS),mean (Retrieval),mean (MultilabelClassification),mean (Clustering),mean (Reranking),mean (InstructionRetrieval),mean (weighted by task type),borda_count,Total Evaluation time (hours) -7,intfloat/multilingual-e5-large-instruct,baa7be480a7de1539afce709c8f13f833a510e0a,0.634,0.801,0.812,0.65,0.767,0.58,0.229,0.515,0.63,-0.004,0.553,1244.0,6.884 -2,GritLM/GritLM-7B,13f00a0e36500c80ce12870ea513846a066004af,0.609,0.705,0.802,0.619,0.732,0.591,0.212,0.504,0.628,0.035,0.536,1119.0,10.675 -11,intfloat/e5-mistral-7b-instruct,07163b72af1488142a360786df853f237b1a3ca1,0.602,0.706,0.814,0.603,0.739,0.554,0.222,0.514,0.634,-0.006,0.531,1100.0,9.969 -3,intfloat/multilingual-e5-large,4dc6d853a804b9c8886ede6dda8a073b7dc08a81,0.587,0.717,0.793,0.599,0.734,0.55,0.213,0.431,0.626,-0.031,0.515,980.0,9.206 -9,intfloat/multilingual-e5-base,d13f1b27baf31030b7fd040960d60d909913633f,0.571,0.694,0.776,0.582,0.712,0.536,0.202,0.428,0.599,-0.027,0.5,811.0,4.261 -4,sentence-transformers/paraphrase-multilingual-mpnet-base-v2,79f2382ceacceacdf38563d7c5d16b9ff8d725d6,0.52,0.521,0.816,0.551,0.695,0.393,0.164,0.412,0.532,-0.011,0.452,698.0,16.15 -0,intfloat/multilingual-e5-small,e4ce9877abf3edfe10b0d82785e83bdcb973e22e,0.556,0.675,0.768,0.565,0.699,0.502,0.191,0.418,0.602,-0.024,0.488,654.0,2.893 -1,sentence-transformers/LaBSE,e34fab64a3011d2176c99545a93d5cbddc9a91b7,0.521,0.763,0.761,0.546,0.652,0.329,0.201,0.394,0.504,-0.03,0.458,589.0,3.818 -5,sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2,bf3bf13ab40c3157080a7ab344c831b9ad18b5eb,0.488,0.445,0.794,0.517,0.664,0.362,0.149,0.396,0.51,-0.013,0.425,475.0,2.759 -6,sentence-transformers/all-mpnet-base-v2,84f2bcc00d77236f9e89c8a360a00fb1139bf47d,0.424,0.212,0.71,0.47,0.571,0.328,0.163,0.411,0.421,-0.031,0.362,397.5,4.772 
-8,sentence-transformers/all-MiniLM-L12-v2,a05860a77cef7b37e0048a7864658139bc18a854,0.421,0.229,0.719,0.468,0.566,0.324,0.146,0.368,0.443,-0.008,0.362,355.0,2.691 -10,sentence-transformers/all-MiniLM-L6-v2,8b3219a92973c328a8e22fadcfa821b5dc75636a,0.415,0.201,0.713,0.463,0.556,0.331,0.151,0.383,0.4,-0.028,0.352,289.5,2.43 diff --git a/tests/mock_mteb_cache/results/results/sentence-transformers__all-MiniLM-L6-v2/no_revision_available/BornholmBitextMining.json b/tests/mock_mteb_cache/results/results/sentence-transformers__all-MiniLM-L6-v2/no_revision_available/BornholmBitextMining.json new file mode 100644 index 0000000000..7401646df1 --- /dev/null +++ b/tests/mock_mteb_cache/results/results/sentence-transformers__all-MiniLM-L6-v2/no_revision_available/BornholmBitextMining.json @@ -0,0 +1,22 @@ +{ + "dataset_revision": "3bc5cfb4ec514264fe2db5615fac9016f7251552", + "evaluation_time": 2.3900349140167236, + "kg_co2_emissions": null, + "mteb_version": "1.12.75", + "scores": { + "test": [ + { + "accuracy": 0.36, + "f1": 0.2968132161955691, + "hf_subset": "default", + "languages": [ + "dan-Latn" + ], + "main_score": 0.2968132161955691, + "precision": 0.27690919913419915, + "recall": 0.36 + } + ] + }, + "task_name": "BornholmBitextMining" +} diff --git a/tests/mock_mteb_cache/results/results/sentence-transformers__all-MiniLM-L6-v2/no_revision_available/model_meta.json b/tests/mock_mteb_cache/results/results/sentence-transformers__all-MiniLM-L6-v2/no_revision_available/model_meta.json new file mode 100644 index 0000000000..1dfa77ef69 --- /dev/null +++ b/tests/mock_mteb_cache/results/results/sentence-transformers__all-MiniLM-L6-v2/no_revision_available/model_meta.json @@ -0,0 +1,15 @@ +{ + "name": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "no_revision_available", + "release_date": null, + "languages": null, + "n_parameters": null, + "memory_usage": null, + "max_tokens": null, + "embed_dim": null, + "license": null, + "open_source": true, + "similarity_fn_name": 
null, + "framework": [], + "loader": null +} diff --git a/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/Banking77Classification.json b/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/Banking77Classification.json new file mode 100644 index 0000000000..4e80208915 --- /dev/null +++ b/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/Banking77Classification.json @@ -0,0 +1,73 @@ +{ + "dataset_revision": "0fd18e25b25c072e09e0d92ab615fda904d66300", + "evaluation_time": 14.252699851989746, + "kg_co2_emissions": null, + "mteb_version": "1.12.75", + "scores": { + "test": [ + { + "accuracy": 0.5273051948051948, + "f1": 0.5052927918458757, + "f1_weighted": 0.5052927918458758, + "hf_subset": "default", + "languages": [ + "eng-Latn" + ], + "main_score": 0.5273051948051948, + "scores_per_experiment": [ + { + "accuracy": 0.5357142857142857, + "f1": 0.5166944909452982, + "f1_weighted": 0.5166944909452983 + }, + { + "accuracy": 0.5285714285714286, + "f1": 0.5029664916598314, + "f1_weighted": 0.5029664916598315 + }, + { + "accuracy": 0.5418831168831169, + "f1": 0.519703769216604, + "f1_weighted": 0.519703769216604 + }, + { + "accuracy": 0.5409090909090909, + "f1": 0.5167890820688497, + "f1_weighted": 0.5167890820688498 + }, + { + "accuracy": 0.5064935064935064, + "f1": 0.48486522111113695, + "f1_weighted": 0.48486522111113706 + }, + { + "accuracy": 0.5253246753246753, + "f1": 0.5119380327833764, + "f1_weighted": 0.5119380327833764 + }, + { + "accuracy": 0.5240259740259741, + "f1": 0.499252464470387, + "f1_weighted": 0.4992524644703871 + }, + { + "accuracy": 0.5415584415584416, + "f1": 0.521100828043587, + "f1_weighted": 0.521100828043587 + }, + { + "accuracy": 0.5188311688311689, + "f1": 0.4949066186283181, + "f1_weighted": 0.49490661862831814 + }, + { + 
"accuracy": 0.5097402597402597, + "f1": 0.48471091953136874, + "f1_weighted": 0.48471091953136886 + } + ] + } + ] + }, + "task_name": "Banking77Classification" +} diff --git a/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/BornholmBitextMining.json b/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/BornholmBitextMining.json new file mode 100644 index 0000000000..35238bf4e9 --- /dev/null +++ b/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/BornholmBitextMining.json @@ -0,0 +1,22 @@ +{ + "dataset_revision": "3bc5cfb4ec514264fe2db5615fac9016f7251552", + "evaluation_time": 0.6970162391662598, + "kg_co2_emissions": null, + "mteb_version": "1.12.75", + "scores": { + "test": [ + { + "accuracy": 0.068, + "f1": 0.0393941119503411, + "hf_subset": "default", + "languages": [ + "dan-Latn" + ], + "main_score": 0.0393941119503411, + "precision": 0.03443237936739081, + "recall": 0.068 + } + ] + }, + "task_name": "BornholmBitextMining" +} diff --git a/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/BrazilianToxicTweetsClassification.json b/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/BrazilianToxicTweetsClassification.json new file mode 100644 index 0000000000..e8c4a55029 --- /dev/null +++ b/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/BrazilianToxicTweetsClassification.json @@ -0,0 +1,73 @@ +{ + "dataset_revision": "fb4f11a5bc68b99891852d20f1ec074be6289768", + "evaluation_time": 1.8276739120483398, + "kg_co2_emissions": null, + "mteb_version": "1.12.75", + "scores": { + "test": [ + { + "accuracy": 0.140673828125, + "f1": 
0.13833202500664074, + "hf_subset": "default", + "languages": [ + "por-Latn" + ], + "lrap": 0.7783620198567717, + "main_score": 0.140673828125, + "scores_per_experiment": [ + { + "accuracy": 0.14990234375, + "f1": 0.1241379234882865, + "lrap": 0.7829996744791681 + }, + { + "accuracy": 0.1181640625, + "f1": 0.13316116423433702, + "lrap": 0.7597656250000007 + }, + { + "accuracy": 0.0986328125, + "f1": 0.14206234991723302, + "lrap": 0.788099500868056 + }, + { + "accuracy": 0.119140625, + "f1": 0.1353569546255579, + "lrap": 0.7939385308159737 + }, + { + "accuracy": 0.22705078125, + "f1": 0.13920443666285207, + "lrap": 0.7721761067708341 + }, + { + "accuracy": 0.125, + "f1": 0.14052258848135046, + "lrap": 0.7596333821614589 + }, + { + "accuracy": 0.13623046875, + "f1": 0.13626463099322808, + "lrap": 0.751844618055556 + }, + { + "accuracy": 0.107421875, + "f1": 0.1597041037726131, + "lrap": 0.8121066623263893 + }, + { + "accuracy": 0.1806640625, + "f1": 0.12107054989596888, + "lrap": 0.7797173394097238 + }, + { + "accuracy": 0.14453125, + "f1": 0.15183554799498014, + "lrap": 0.7833387586805565 + } + ] + } + ] + }, + "task_name": "BrazilianToxicTweetsClassification" +} diff --git a/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/FaroeseSTS.json b/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/FaroeseSTS.json new file mode 100644 index 0000000000..0856e05735 --- /dev/null +++ b/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/FaroeseSTS.json @@ -0,0 +1,32 @@ +{ + "dataset_revision": "8cb36efa69428b3dc290e1125995a999963163c5", + "evaluation_time": 0.6911499500274658, + "kg_co2_emissions": null, + "mteb_version": "1.12.75", + "scores": { + "train": [ + { + "cosine_pearson": 0.1106871505768587, + "cosine_spearman": 0.20133273626294929, + 
"euclidean_pearson": 0.14969537470030972, + "euclidean_spearman": 0.19061456589163325, + "hf_subset": "default", + "languages": [ + "fao-Latn" + ], + "main_score": 0.20133273626294929, + "manhattan_pearson": 0.14973612252759752, + "manhattan_spearman": 0.19080662705097662, + "pearson": [ + 0.04620749634924611, + 0.21271772666081656 + ], + "spearman": [ + 0.17833030945908993, + 1.2626269918000071e-06 + ] + } + ] + }, + "task_name": "FaroeseSTS" +} diff --git a/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/FarsTail.json b/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/FarsTail.json new file mode 100644 index 0000000000..77f0a8745f --- /dev/null +++ b/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/FarsTail.json @@ -0,0 +1,57 @@ +{ + "dataset_revision": "7335288588f14e5a687d97fc979194c2abe6f4e7", + "evaluation_time": 0.2702820301055908, + "kg_co2_emissions": null, + "mteb_version": "1.12.75", + "scores": { + "test": [ + { + "cosine_accuracy": 0.5053449951409135, + "cosine_accuracy_threshold": 1.0, + "cosine_ap": 0.5043816778458392, + "cosine_f1": 0.6709760827407886, + "cosine_f1_threshold": 0.5, + "cosine_precision": 0.504863813229572, + "cosine_recall": 1.0, + "dot_accuracy": 0.5053449951409135, + "dot_accuracy_threshold": 0.0, + "dot_ap": 0.5043731778425656, + "dot_f1": 0.6709760827407886, + "dot_f1_threshold": 0.0, + "dot_precision": 0.504863813229572, + "dot_recall": 1.0, + "euclidean_accuracy": 0.5053449951409135, + "euclidean_accuracy_threshold": 0.0, + "euclidean_ap": 0.5043816778458392, + "euclidean_f1": 0.6709760827407886, + "euclidean_f1_threshold": 1.0, + "euclidean_precision": 0.504863813229572, + "euclidean_recall": 1.0, + "hf_subset": "default", + "languages": [ + "fas-Arab" + ], + "main_score": 0.5043816778458392, + 
"manhattan_accuracy": 0.5053449951409135, + "manhattan_accuracy_threshold": 0.0, + "manhattan_ap": 0.5043816778458392, + "manhattan_f1": 0.6709760827407886, + "manhattan_f1_threshold": 14.147068977355957, + "manhattan_precision": 0.504863813229572, + "manhattan_recall": 1.0, + "max_ap": 0.5043816778458392, + "max_f1": 0.6709760827407886, + "max_precision": 0.504863813229572, + "max_recall": 1.0, + "similarity_accuracy": 0.5053449951409135, + "similarity_accuracy_threshold": 0.0, + "similarity_ap": 0.5043731778425656, + "similarity_f1": 0.6709760827407886, + "similarity_f1_threshold": 0.0, + "similarity_precision": 0.504863813229572, + "similarity_recall": 1.0 + } + ] + }, + "task_name": "FarsTail" +} diff --git a/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/IndicSentimentClassification.json b/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/IndicSentimentClassification.json new file mode 100644 index 0000000000..525f4a8aa1 --- /dev/null +++ b/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/IndicSentimentClassification.json @@ -0,0 +1,39 @@ +{ + "dataset_revision": "3389cc78b2ffcbd33639e91dfc57e6b6b6496241", + "evaluation_time": 0.8038129806518555, + "kg_co2_emissions": null, + "mteb_version": "1.12.75", + "scores": { + "test": [ + { + "accuracy": 0.49448897795591185, + "ap": 0.5072290593742705, + "ap_weighted": 0.5072290593742705, + "f1": 0.38022739586494403, + "f1_weighted": 0.3765539215198198, + "hf_subset": "as", + "languages": [ + "asm-Beng" + ], + "main_score": 0.49448897795591185, + "scores_per_experiment": [ + { + "accuracy": 0.49298597194388777, + "ap": 0.5070140280561122, + "ap_weighted": 0.5070140280561122, + "f1": 0.3302013422818792, + "f1_weighted": 0.3255692593240172 + }, + { + "accuracy": 0.49599198396793587, + "ap": 
0.5074440906924288, + "ap_weighted": 0.5074440906924288, + "f1": 0.4302534494480088, + "f1_weighted": 0.4275385837156225 + } + ] + } + ] + }, + "task_name": "IndicSentimentClassification" +} diff --git a/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/MalteseNewsClassification.json b/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/MalteseNewsClassification.json new file mode 100644 index 0000000000..b3d60f0dcc --- /dev/null +++ b/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/MalteseNewsClassification.json @@ -0,0 +1,73 @@ +{ + "dataset_revision": "6bb0321659c4f07c4c2176c30c98c971be6571b4", + "evaluation_time": 5.3402721881866455, + "kg_co2_emissions": null, + "mteb_version": "1.12.34", + "scores": { + "test": [ + { + "accuracy": 0.1, + "f1": 0.07807827329759731, + "hf_subset": "default", + "languages": [ + "mlt-Latn" + ], + "lrap": 0.1770734099037994, + "main_score": 0.1, + "scores_per_experiment": [ + { + "accuracy": 0.07357422725293862, + "f1": 0.055031749516062954, + "lrap": 0.1488701161788215 + }, + { + "accuracy": 0.051371353939921635, + "f1": 0.04463325542256335, + "lrap": 0.12601901883958105 + }, + { + "accuracy": 0.08532868959512407, + "f1": 0.07383727780154527, + "lrap": 0.16476520952308119 + }, + { + "accuracy": 0.09403569873748367, + "f1": 0.07629750510182237, + "lrap": 0.16899387094846982 + }, + { + "accuracy": 0.11057901610796692, + "f1": 0.09706152712588817, + "lrap": 0.19162035732882943 + }, + { + "accuracy": 0.13974749673487158, + "f1": 0.09274566564182024, + "lrap": 0.22327076237548005 + }, + { + "accuracy": 0.12581628210709622, + "f1": 0.09769734857780066, + "lrap": 0.20463392148326792 + }, + { + "accuracy": 0.07313887679582064, + "f1": 0.0653950768879792, + "lrap": 0.14928999462214482 + }, + { + "accuracy": 
0.12363952982150632, + "f1": 0.0938568045912992, + "lrap": 0.20007127796700205 + }, + { + "accuracy": 0.12276882890727035, + "f1": 0.0842265223091917, + "lrap": 0.1931995697713163 + } + ] + } + ] + }, + "task_name": "MalteseNewsClassification" +} diff --git a/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/MockBitextMiningTask.json b/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/MockBitextMiningTask.json new file mode 100644 index 0000000000..71d12f822d --- /dev/null +++ b/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/MockBitextMiningTask.json @@ -0,0 +1,22 @@ +{ + "dataset_revision": "NA", + "evaluation_time": 0.008999824523925781, + "kg_co2_emissions": null, + "mteb_version": "1.12.75", + "scores": { + "test": [ + { + "accuracy": 0.5, + "f1": 0.3333333333333333, + "hf_subset": "default", + "languages": [ + "eng-Latn" + ], + "main_score": 0.5, + "precision": 0.25, + "recall": 0.5 + } + ] + }, + "task_name": "MockBitextMiningTask" +} diff --git a/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/MockClassificationTask.json b/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/MockClassificationTask.json new file mode 100644 index 0000000000..09d4136bec --- /dev/null +++ b/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/MockClassificationTask.json @@ -0,0 +1,95 @@ +{ + "dataset_revision": "NA", + "evaluation_time": 0.18496918678283691, + "kg_co2_emissions": null, + "mteb_version": "1.12.75", + "scores": { + "test": [ + { + "accuracy": 0.5, + "ap": 0.5, + "ap_weighted": 0.5, + "f1": 0.33333333333333337, + "f1_weighted": 
0.33333333333333337, + "hf_subset": "default", + "languages": [ + "eng-Latn" + ], + "main_score": 0.5, + "scores_per_experiment": [ + { + "accuracy": 0.5, + "ap": 0.5, + "ap_weighted": 0.5, + "f1": 0.3333333333333333, + "f1_weighted": 0.3333333333333333 + }, + { + "accuracy": 0.5, + "ap": 0.5, + "ap_weighted": 0.5, + "f1": 0.3333333333333333, + "f1_weighted": 0.3333333333333333 + }, + { + "accuracy": 0.5, + "ap": 0.5, + "ap_weighted": 0.5, + "f1": 0.3333333333333333, + "f1_weighted": 0.3333333333333333 + }, + { + "accuracy": 0.5, + "ap": 0.5, + "ap_weighted": 0.5, + "f1": 0.3333333333333333, + "f1_weighted": 0.3333333333333333 + }, + { + "accuracy": 0.5, + "ap": 0.5, + "ap_weighted": 0.5, + "f1": 0.3333333333333333, + "f1_weighted": 0.3333333333333333 + }, + { + "accuracy": 0.5, + "ap": 0.5, + "ap_weighted": 0.5, + "f1": 0.3333333333333333, + "f1_weighted": 0.3333333333333333 + }, + { + "accuracy": 0.5, + "ap": 0.5, + "ap_weighted": 0.5, + "f1": 0.3333333333333333, + "f1_weighted": 0.3333333333333333 + }, + { + "accuracy": 0.5, + "ap": 0.5, + "ap_weighted": 0.5, + "f1": 0.3333333333333333, + "f1_weighted": 0.3333333333333333 + }, + { + "accuracy": 0.5, + "ap": 0.5, + "ap_weighted": 0.5, + "f1": 0.3333333333333333, + "f1_weighted": 0.3333333333333333 + }, + { + "accuracy": 0.5, + "ap": 0.5, + "ap_weighted": 0.5, + "f1": 0.3333333333333333, + "f1_weighted": 0.3333333333333333 + } + ] + } + ] + }, + "task_name": "MockClassificationTask" +} diff --git a/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/MockClusteringFastTask.json b/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/MockClusteringFastTask.json new file mode 100644 index 0000000000..42c1a8f1f9 --- /dev/null +++ b/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/MockClusteringFastTask.json 
@@ -0,0 +1,34 @@ +{ + "dataset_revision": "NA", + "evaluation_time": 1.1901772022247314, + "kg_co2_emissions": null, + "mteb_version": "1.12.75", + "scores": { + "test": [ + { + "hf_subset": "default", + "languages": [ + "eng-Latn" + ], + "main_score": 0.0, + "v_measure": 0.0, + "v_measure_std": 0.0, + "v_measures": { + "Level 0": [ + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + } + } + ] + }, + "task_name": "MockClusteringFastTask" +} diff --git a/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/MockClusteringTask.json b/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/MockClusteringTask.json new file mode 100644 index 0000000000..c0086aefea --- /dev/null +++ b/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/MockClusteringTask.json @@ -0,0 +1,23 @@ +{ + "dataset_revision": "NA", + "evaluation_time": 0.015342950820922852, + "kg_co2_emissions": null, + "mteb_version": "1.12.75", + "scores": { + "test": [ + { + "hf_subset": "default", + "languages": [ + "eng-Latn" + ], + "main_score": 0.0, + "v_measure": 0.0, + "v_measure_std": 0.0, + "v_measures": [ + 0.0 + ] + } + ] + }, + "task_name": "MockClusteringTask" +} diff --git a/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/MockMultilabelClassification.json b/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/MockMultilabelClassification.json new file mode 100644 index 0000000000..c67ad951b7 --- /dev/null +++ b/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/MockMultilabelClassification.json @@ -0,0 +1,73 @@ +{ + "dataset_revision": "NA", + 
"evaluation_time": 0.07277321815490723, + "kg_co2_emissions": null, + "mteb_version": "1.12.75", + "scores": { + "test": [ + { + "accuracy": 1.0, + "f1": 1.0, + "hf_subset": "default", + "languages": [ + "eng-Latn" + ], + "lrap": 1.0, + "main_score": 1.0, + "scores_per_experiment": [ + { + "accuracy": 1.0, + "f1": 1.0, + "lrap": 1.0 + }, + { + "accuracy": 1.0, + "f1": 1.0, + "lrap": 1.0 + }, + { + "accuracy": 1.0, + "f1": 1.0, + "lrap": 1.0 + }, + { + "accuracy": 1.0, + "f1": 1.0, + "lrap": 1.0 + }, + { + "accuracy": 1.0, + "f1": 1.0, + "lrap": 1.0 + }, + { + "accuracy": 1.0, + "f1": 1.0, + "lrap": 1.0 + }, + { + "accuracy": 1.0, + "f1": 1.0, + "lrap": 1.0 + }, + { + "accuracy": 1.0, + "f1": 1.0, + "lrap": 1.0 + }, + { + "accuracy": 1.0, + "f1": 1.0, + "lrap": 1.0 + }, + { + "accuracy": 1.0, + "f1": 1.0, + "lrap": 1.0 + } + ] + } + ] + }, + "task_name": "MockMultilabelClassification" +} diff --git a/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/MockPairClassificationTask.json b/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/MockPairClassificationTask.json new file mode 100644 index 0000000000..fda0b0e94f --- /dev/null +++ b/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/MockPairClassificationTask.json @@ -0,0 +1,57 @@ +{ + "dataset_revision": "NA", + "evaluation_time": 0.007855892181396484, + "kg_co2_emissions": null, + "mteb_version": "1.12.75", + "scores": { + "test": [ + { + "cosine_accuracy": 1.0, + "cosine_accuracy_threshold": 0.48992985486984253, + "cosine_ap": 1.0, + "cosine_f1": 1.0, + "cosine_f1_threshold": 0.48992985486984253, + "cosine_precision": 1.0, + "cosine_recall": 1.0, + "dot_accuracy": 1.0, + "dot_accuracy_threshold": 0.30161741375923157, + "dot_ap": 1.0, + "dot_f1": 1.0, + "dot_f1_threshold": 0.30161741375923157, + 
"dot_precision": 1.0, + "dot_recall": 1.0, + "euclidean_accuracy": 1.0, + "euclidean_accuracy_threshold": 0.8020626306533813, + "euclidean_ap": 1.0, + "euclidean_f1": 1.0, + "euclidean_f1_threshold": 0.8020626306533813, + "euclidean_precision": 1.0, + "euclidean_recall": 1.0, + "hf_subset": "default", + "languages": [ + "eng-Latn" + ], + "main_score": 1.0, + "manhattan_accuracy": 1.0, + "manhattan_accuracy_threshold": 10.830148696899414, + "manhattan_ap": 1.0, + "manhattan_f1": 1.0, + "manhattan_f1_threshold": 10.830148696899414, + "manhattan_precision": 1.0, + "manhattan_recall": 1.0, + "max_ap": 1.0, + "max_f1": 1.0, + "max_precision": 1.0, + "max_recall": 1.0, + "similarity_accuracy": 1.0, + "similarity_accuracy_threshold": 0.48992985486984253, + "similarity_ap": 1.0, + "similarity_f1": 1.0, + "similarity_f1_threshold": 0.48992985486984253, + "similarity_precision": 1.0, + "similarity_recall": 1.0 + } + ] + }, + "task_name": "MockPairClassificationTask" +} diff --git a/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/MockRerankingTask.json b/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/MockRerankingTask.json new file mode 100644 index 0000000000..1a00bd05d7 --- /dev/null +++ b/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/MockRerankingTask.json @@ -0,0 +1,26 @@ +{ + "dataset_revision": "NA", + "evaluation_time": 0.01447296142578125, + "kg_co2_emissions": null, + "mteb_version": "1.12.75", + "scores": { + "test": [ + { + "hf_subset": "default", + "languages": [ + "eng-Latn" + ], + "main_score": 0.49749687819177474, + "map": 0.49749687819177474, + "mrr": 1.0, + "nAUC_map_diff1": 1.0, + "nAUC_map_max": -0.99999999999999, + "nAUC_map_std": -0.99999999999999, + "nAUC_mrr_diff1": NaN, + "nAUC_mrr_max": NaN, + "nAUC_mrr_std": NaN + } + ] + }, + 
"task_name": "MockRerankingTask" +} diff --git a/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/MockRetrievalTask.json b/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/MockRetrievalTask.json new file mode 100644 index 0000000000..ce0e34f6e4 --- /dev/null +++ b/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/MockRetrievalTask.json @@ -0,0 +1,158 @@ +{ + "dataset_revision": "NA", + "evaluation_time": 0.03309297561645508, + "kg_co2_emissions": null, + "mteb_version": "1.12.75", + "scores": { + "test": [ + { + "hf_subset": "default", + "languages": [ + "eng-Latn" + ], + "main_score": 0.81546, + "map_at_1": 0.5, + "map_at_10": 0.75, + "map_at_100": 0.75, + "map_at_1000": 0.75, + "map_at_20": 0.75, + "map_at_3": 0.75, + "map_at_5": 0.75, + "mrr_at_1": 0.5, + "mrr_at_10": 0.75, + "mrr_at_100": 0.75, + "mrr_at_1000": 0.75, + "mrr_at_20": 0.75, + "mrr_at_3": 0.75, + "mrr_at_5": 0.75, + "nauc_map_at_1000_diff1": 1.0, + "nauc_map_at_1000_max": 1.0, + "nauc_map_at_1000_std": 1.0, + "nauc_map_at_100_diff1": 1.0, + "nauc_map_at_100_max": 1.0, + "nauc_map_at_100_std": 1.0, + "nauc_map_at_10_diff1": 1.0, + "nauc_map_at_10_max": 1.0, + "nauc_map_at_10_std": 1.0, + "nauc_map_at_1_diff1": 1.0, + "nauc_map_at_1_max": 1.0, + "nauc_map_at_1_std": 1.0, + "nauc_map_at_20_diff1": 1.0, + "nauc_map_at_20_max": 1.0, + "nauc_map_at_20_std": 1.0, + "nauc_map_at_3_diff1": 1.0, + "nauc_map_at_3_max": 1.0, + "nauc_map_at_3_std": 1.0, + "nauc_map_at_5_diff1": 1.0, + "nauc_map_at_5_max": 1.0, + "nauc_map_at_5_std": 1.0, + "nauc_mrr_at_1000_diff1": -1.0, + "nauc_mrr_at_1000_max": -1.0, + "nauc_mrr_at_1000_std": -1.0, + "nauc_mrr_at_100_diff1": -1.0, + "nauc_mrr_at_100_max": -1.0, + "nauc_mrr_at_100_std": -1.0, + "nauc_mrr_at_10_diff1": -1.0, + "nauc_mrr_at_10_max": -1.0, + 
"nauc_mrr_at_10_std": -1.0, + "nauc_mrr_at_1_diff1": -1.0, + "nauc_mrr_at_1_max": -1.0, + "nauc_mrr_at_1_std": -1.0, + "nauc_mrr_at_20_diff1": -1.0, + "nauc_mrr_at_20_max": -1.0, + "nauc_mrr_at_20_std": -1.0, + "nauc_mrr_at_3_diff1": -1.0, + "nauc_mrr_at_3_max": -1.0, + "nauc_mrr_at_3_std": -1.0, + "nauc_mrr_at_5_diff1": -1.0, + "nauc_mrr_at_5_max": -1.0, + "nauc_mrr_at_5_std": -1.0, + "nauc_ndcg_at_1000_diff1": 1.0, + "nauc_ndcg_at_1000_max": 1.0, + "nauc_ndcg_at_1000_std": 1.0, + "nauc_ndcg_at_100_diff1": 1.0, + "nauc_ndcg_at_100_max": 1.0, + "nauc_ndcg_at_100_std": 1.0, + "nauc_ndcg_at_10_diff1": 1.0, + "nauc_ndcg_at_10_max": 1.0, + "nauc_ndcg_at_10_std": 1.0, + "nauc_ndcg_at_1_diff1": 1.0, + "nauc_ndcg_at_1_max": 1.0, + "nauc_ndcg_at_1_std": 1.0, + "nauc_ndcg_at_20_diff1": 1.0, + "nauc_ndcg_at_20_max": 1.0, + "nauc_ndcg_at_20_std": 1.0, + "nauc_ndcg_at_3_diff1": 1.0, + "nauc_ndcg_at_3_max": 1.0, + "nauc_ndcg_at_3_std": 1.0, + "nauc_ndcg_at_5_diff1": 1.0, + "nauc_ndcg_at_5_max": 1.0, + "nauc_ndcg_at_5_std": 1.0, + "nauc_precision_at_1000_diff1": 1.0, + "nauc_precision_at_1000_max": 1.0, + "nauc_precision_at_1000_std": 1.0, + "nauc_precision_at_100_diff1": 1.0, + "nauc_precision_at_100_max": 1.0, + "nauc_precision_at_100_std": 1.0, + "nauc_precision_at_10_diff1": NaN, + "nauc_precision_at_10_max": NaN, + "nauc_precision_at_10_std": NaN, + "nauc_precision_at_1_diff1": 1.0, + "nauc_precision_at_1_max": 1.0, + "nauc_precision_at_1_std": 1.0, + "nauc_precision_at_20_diff1": NaN, + "nauc_precision_at_20_max": NaN, + "nauc_precision_at_20_std": NaN, + "nauc_precision_at_3_diff1": NaN, + "nauc_precision_at_3_max": NaN, + "nauc_precision_at_3_std": NaN, + "nauc_precision_at_5_diff1": NaN, + "nauc_precision_at_5_max": NaN, + "nauc_precision_at_5_std": NaN, + "nauc_recall_at_1000_diff1": NaN, + "nauc_recall_at_1000_max": NaN, + "nauc_recall_at_1000_std": NaN, + "nauc_recall_at_100_diff1": NaN, + "nauc_recall_at_100_max": NaN, + "nauc_recall_at_100_std": NaN, + 
"nauc_recall_at_10_diff1": NaN, + "nauc_recall_at_10_max": NaN, + "nauc_recall_at_10_std": NaN, + "nauc_recall_at_1_diff1": 1.0, + "nauc_recall_at_1_max": 1.0, + "nauc_recall_at_1_std": 1.0, + "nauc_recall_at_20_diff1": NaN, + "nauc_recall_at_20_max": NaN, + "nauc_recall_at_20_std": NaN, + "nauc_recall_at_3_diff1": NaN, + "nauc_recall_at_3_max": NaN, + "nauc_recall_at_3_std": NaN, + "nauc_recall_at_5_diff1": NaN, + "nauc_recall_at_5_max": NaN, + "nauc_recall_at_5_std": NaN, + "ndcg_at_1": 0.5, + "ndcg_at_10": 0.81546, + "ndcg_at_100": 0.81546, + "ndcg_at_1000": 0.81546, + "ndcg_at_20": 0.81546, + "ndcg_at_3": 0.81546, + "ndcg_at_5": 0.81546, + "precision_at_1": 0.5, + "precision_at_10": 0.1, + "precision_at_100": 0.01, + "precision_at_1000": 0.001, + "precision_at_20": 0.05, + "precision_at_3": 0.33333, + "precision_at_5": 0.2, + "recall_at_1": 0.5, + "recall_at_10": 1.0, + "recall_at_100": 1.0, + "recall_at_1000": 1.0, + "recall_at_20": 1.0, + "recall_at_3": 1.0, + "recall_at_5": 1.0 + } + ] + }, + "task_name": "MockRetrievalTask" +} diff --git a/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/MockSTSTask.json b/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/MockSTSTask.json new file mode 100644 index 0000000000..f88467751a --- /dev/null +++ b/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/MockSTSTask.json @@ -0,0 +1,32 @@ +{ + "dataset_revision": "NA", + "evaluation_time": 0.010137796401977539, + "kg_co2_emissions": null, + "mteb_version": "1.12.75", + "scores": { + "test": [ + { + "cosine_pearson": 1.0, + "cosine_spearman": 0.9999999999999999, + "euclidean_pearson": 1.0, + "euclidean_spearman": 0.9999999999999999, + "hf_subset": "default", + "languages": [ + "eng-Latn" + ], + "main_score": 0.9999999999999999, + "manhattan_pearson": 1.0, + 
"manhattan_spearman": 0.9999999999999999, + "pearson": [ + 1.0, + 1.0 + ], + "spearman": [ + 0.9999999999999999, + NaN + ] + } + ] + }, + "task_name": "MockSTSTask" +} diff --git a/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/MockSummarizationTask.json b/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/MockSummarizationTask.json new file mode 100644 index 0000000000..b1e27ad5ec --- /dev/null +++ b/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/MockSummarizationTask.json @@ -0,0 +1,24 @@ +{ + "dataset_revision": "NA", + "evaluation_time": 0.008913993835449219, + "kg_co2_emissions": null, + "mteb_version": "1.12.75", + "scores": { + "test": [ + { + "cosine_pearson": NaN, + "cosine_spearman": NaN, + "dot_pearson": NaN, + "dot_spearman": NaN, + "hf_subset": "default", + "languages": [ + "eng-Latn" + ], + "main_score": NaN, + "pearson": NaN, + "spearman": NaN + } + ] + }, + "task_name": "MockSummarizationTask" +} diff --git a/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/NFCorpus.json b/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/NFCorpus.json new file mode 100644 index 0000000000..5ee4d5c2c5 --- /dev/null +++ b/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/NFCorpus.json @@ -0,0 +1,158 @@ +{ + "dataset_revision": "ec0fa4fe99da2ff19ca1214b7966684033a58814", + "evaluation_time": 2.8802309036254883, + "kg_co2_emissions": null, + "mteb_version": "1.12.34", + "scores": { + "test": [ + { + "hf_subset": "default", + "languages": [ + "eng-Latn" + ], + "main_score": 0.07071, + "map_at_1": 0.00538, + "map_at_10": 0.01495, + 
"map_at_100": 0.01976, + "map_at_1000": 0.02665, + "map_at_20": 0.0166, + "map_at_3": 0.01111, + "map_at_5": 0.01252, + "mrr_at_1": 0.09287925696594428, + "mrr_at_10": 0.14995331465919706, + "mrr_at_100": 0.16101571404978912, + "mrr_at_1000": 0.16223373810445677, + "mrr_at_20": 0.15481683556799306, + "mrr_at_3": 0.1320949432404541, + "mrr_at_5": 0.13998968008255938, + "nauc_map_at_1000_diff1": 0.2918947310849265, + "nauc_map_at_1000_max": 0.08601688813804562, + "nauc_map_at_1000_std": 0.2333371927190971, + "nauc_map_at_100_diff1": 0.29032458527373306, + "nauc_map_at_100_max": 0.12472673102176648, + "nauc_map_at_100_std": 0.22134069097287792, + "nauc_map_at_10_diff1": 0.3254307682385631, + "nauc_map_at_10_max": 0.161650011828213, + "nauc_map_at_10_std": 0.25413545118345, + "nauc_map_at_1_diff1": 0.3349622392758938, + "nauc_map_at_1_max": 0.004254004877583021, + "nauc_map_at_1_std": 0.4754683060634058, + "nauc_map_at_20_diff1": 0.3111812212524331, + "nauc_map_at_20_max": 0.14271075496215116, + "nauc_map_at_20_std": 0.24451170153420657, + "nauc_map_at_3_diff1": 0.4264352201746056, + "nauc_map_at_3_max": 0.14835119189370422, + "nauc_map_at_3_std": 0.4108842097097752, + "nauc_map_at_5_diff1": 0.3963790626389921, + "nauc_map_at_5_max": 0.17927931796920032, + "nauc_map_at_5_std": 0.3398529553182776, + "nauc_mrr_at_1000_diff1": 0.1273385780636387, + "nauc_mrr_at_1000_max": 0.037167478980682296, + "nauc_mrr_at_1000_std": 0.10128096286158758, + "nauc_mrr_at_100_diff1": 0.12653948231890733, + "nauc_mrr_at_100_max": 0.036993619677288106, + "nauc_mrr_at_100_std": 0.10097733668527223, + "nauc_mrr_at_10_diff1": 0.12436002991921302, + "nauc_mrr_at_10_max": 0.03138073737936428, + "nauc_mrr_at_10_std": 0.09166454960222017, + "nauc_mrr_at_1_diff1": 0.06944501164031058, + "nauc_mrr_at_1_max": -0.004728867347540564, + "nauc_mrr_at_1_std": 0.07695233227507062, + "nauc_mrr_at_20_diff1": 0.1267941930573542, + "nauc_mrr_at_20_max": 0.03364839604204623, + "nauc_mrr_at_20_std": 
0.10354023332519324, + "nauc_mrr_at_3_diff1": 0.13177307722402976, + "nauc_mrr_at_3_max": 0.014991755894987564, + "nauc_mrr_at_3_std": 0.1042015139235763, + "nauc_mrr_at_5_diff1": 0.12999517189096008, + "nauc_mrr_at_5_max": 0.019876119626624757, + "nauc_mrr_at_5_std": 0.0962475534059333, + "nauc_ndcg_at_1000_diff1": 0.2318081866368502, + "nauc_ndcg_at_1000_max": 0.1093735598308187, + "nauc_ndcg_at_1000_std": 0.16939791921344968, + "nauc_ndcg_at_100_diff1": 0.19883523442834397, + "nauc_ndcg_at_100_max": 0.09078487689358479, + "nauc_ndcg_at_100_std": 0.13073501626743556, + "nauc_ndcg_at_10_diff1": 0.19990576357362128, + "nauc_ndcg_at_10_max": 0.03255543468979134, + "nauc_ndcg_at_10_std": 0.10660797602269385, + "nauc_ndcg_at_1_diff1": 0.08252758871345879, + "nauc_ndcg_at_1_max": -0.005390601677998011, + "nauc_ndcg_at_1_std": 0.08941580633412254, + "nauc_ndcg_at_20_diff1": 0.21537908953269866, + "nauc_ndcg_at_20_max": 0.035049531358146827, + "nauc_ndcg_at_20_std": 0.1326242592145833, + "nauc_ndcg_at_3_diff1": 0.20293786276875925, + "nauc_ndcg_at_3_max": 0.021888032395159084, + "nauc_ndcg_at_3_std": 0.10265124876936188, + "nauc_ndcg_at_5_diff1": 0.20414118256259436, + "nauc_ndcg_at_5_max": 0.04143863792472334, + "nauc_ndcg_at_5_std": 0.10372505035302813, + "nauc_precision_at_1000_diff1": 0.19779120453190838, + "nauc_precision_at_1000_max": -0.14351824180051337, + "nauc_precision_at_1000_std": 0.25579387522195296, + "nauc_precision_at_100_diff1": 0.16245228882946416, + "nauc_precision_at_100_max": -0.11303788632241604, + "nauc_precision_at_100_std": 0.14956109825848515, + "nauc_precision_at_10_diff1": 0.13443586216803716, + "nauc_precision_at_10_max": -0.025714910000238612, + "nauc_precision_at_10_std": 0.028075492402551395, + "nauc_precision_at_1_diff1": 0.06944501164031058, + "nauc_precision_at_1_max": -0.004728867347540564, + "nauc_precision_at_1_std": 0.07695233227507062, + "nauc_precision_at_20_diff1": 0.13153881047931187, + "nauc_precision_at_20_max": 
-0.05860307439066314, + "nauc_precision_at_20_std": 0.06891095567577976, + "nauc_precision_at_3_diff1": 0.20494980937628893, + "nauc_precision_at_3_max": 0.016317803967424382, + "nauc_precision_at_3_std": 0.05022162040229963, + "nauc_precision_at_5_diff1": 0.1659782496219298, + "nauc_precision_at_5_max": 0.025374091669662643, + "nauc_precision_at_5_std": 0.0359956748684954, + "nauc_recall_at_1000_diff1": 0.18643889352757537, + "nauc_recall_at_1000_max": 0.06018063431037172, + "nauc_recall_at_1000_std": 0.15496662378535794, + "nauc_recall_at_100_diff1": 0.09969748900505483, + "nauc_recall_at_100_max": 0.10250818015186074, + "nauc_recall_at_100_std": 0.06775562108691197, + "nauc_recall_at_10_diff1": 0.24256921518559532, + "nauc_recall_at_10_max": 0.20417002840019832, + "nauc_recall_at_10_std": 0.14017561784073584, + "nauc_recall_at_1_diff1": 0.3349622392758938, + "nauc_recall_at_1_max": 0.004254004877583021, + "nauc_recall_at_1_std": 0.4754683060634058, + "nauc_recall_at_20_diff1": 0.21725327296711783, + "nauc_recall_at_20_max": 0.1404315259117994, + "nauc_recall_at_20_std": 0.1503290385316223, + "nauc_recall_at_3_diff1": 0.4797216829631059, + "nauc_recall_at_3_max": 0.2015892326946194, + "nauc_recall_at_3_std": 0.41401227231468263, + "nauc_recall_at_5_diff1": 0.4052447975654802, + "nauc_recall_at_5_max": 0.2481799676075496, + "nauc_recall_at_5_std": 0.28791608275753816, + "ndcg_at_1": 0.08978, + "ndcg_at_10": 0.07071, + "ndcg_at_100": 0.07755, + "ndcg_at_1000": 0.17328, + "ndcg_at_20": 0.06629, + "ndcg_at_3": 0.08114, + "ndcg_at_5": 0.07448, + "precision_at_1": 0.09288, + "precision_at_10": 0.05635, + "precision_at_100": 0.02728, + "precision_at_1000": 0.01464, + "precision_at_20": 0.0435, + "precision_at_3": 0.07946, + "precision_at_5": 0.06687, + "recall_at_1": 0.00538, + "recall_at_10": 0.03328, + "recall_at_100": 0.10227, + "recall_at_1000": 0.43083, + "recall_at_20": 0.04555, + "recall_at_3": 0.01818, + "recall_at_5": 0.02235 + } + ] + }, + "task_name": 
"NFCorpus" +} diff --git a/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/STS12.json b/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/STS12.json new file mode 100644 index 0000000000..93242abefe --- /dev/null +++ b/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/STS12.json @@ -0,0 +1,32 @@ +{ + "dataset_revision": "a0d554a64d88156834ff5ae9920b964011b16384", + "evaluation_time": 0.40897083282470703, + "kg_co2_emissions": null, + "mteb_version": "1.12.34", + "scores": { + "test": [ + { + "cosine_pearson": 0.5555605412269286, + "cosine_spearman": 0.515028900933014, + "euclidean_pearson": 0.5220851241163729, + "euclidean_spearman": 0.4852364222463822, + "hf_subset": "default", + "languages": [ + "eng-Latn" + ], + "main_score": 0.515028900933014, + "manhattan_pearson": 0.5224188971231623, + "manhattan_spearman": 0.4855020457010234, + "pearson": [ + 0.5493485037261685, + 1.3551953801268352e-244 + ], + "spearman": [ + 0.5147633253853967, + 6.385933901100005e-210 + ] + } + ] + }, + "task_name": "STS12" +} diff --git a/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/SciDocsRR.json b/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/SciDocsRR.json new file mode 100644 index 0000000000..fe7c3b66f1 --- /dev/null +++ b/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/SciDocsRR.json @@ -0,0 +1,26 @@ +{ + "dataset_revision": "d3c5e1fc0b855ab6097bf1cda04dd73947d7caab", + "evaluation_time": 203.36270904541016, + "kg_co2_emissions": null, + "mteb_version": "1.12.75", + "scores": { + "test": [ + { + "hf_subset": "default", + "languages": [ + 
"eng-Latn" + ], + "main_score": 0.5786186224378919, + "map": 0.5786186224378919, + "mrr": 0.8184027029615265, + "nAUC_map_diff1": 0.1637494152705613, + "nAUC_map_max": 0.5045013171546223, + "nAUC_map_std": 0.3674679240517658, + "nAUC_mrr_diff1": 0.4150582464942141, + "nAUC_mrr_max": 0.5881812181571324, + "nAUC_mrr_std": 0.35355248019712077 + } + ] + }, + "task_name": "SciDocsRR" +} diff --git a/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/SprintDuplicateQuestions.json b/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/SprintDuplicateQuestions.json new file mode 100644 index 0000000000..bdcc32667d --- /dev/null +++ b/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/SprintDuplicateQuestions.json @@ -0,0 +1,127 @@ +{ + "dataset_revision": "d66bd1f72af766a5cc4b0ca5e00c162f89e8cc46", + "evaluation_time": 21.549480199813843, + "kg_co2_emissions": null, + "mteb_version": "1.12.34", + "scores": { + "test": [ + { + "cosine": { + "accuracy": 0.9954950495049505, + "accuracy_threshold": 0.907312273979187, + "ap": 0.8012273184915149, + "f1": 0.7572016460905351, + "f1_threshold": 0.9018044471740723, + "precision": 0.7796610169491526, + "recall": 0.736 + }, + "dot": { + "accuracy": 0.9902079207920792, + "accuracy_threshold": 0.47660601139068604, + "ap": 0.2218132363528724, + "f1": 0.3066424021838035, + "f1_threshold": 0.403484582901001, + "precision": 0.28130217028380633, + "recall": 0.337 + }, + "euclidean": { + "accuracy": 0.9954653465346535, + "accuracy_threshold": 0.26685065031051636, + "ap": 0.7727693506179343, + "f1": 0.7418677859391395, + "f1_threshold": 0.2830333113670349, + "precision": 0.7803532008830022, + "recall": 0.707 + }, + "hf_subset": "default", + "languages": [ + "eng-Latn" + ], + "main_score": 0.8012273184915149, + "manhattan": { + 
"accuracy": 0.9954455445544554, + "accuracy_threshold": 3.6830081939697266, + "ap": 0.7726615479510065, + "f1": 0.7445652173913043, + "f1_threshold": 3.8327877521514893, + "precision": 0.8154761904761905, + "recall": 0.685 + }, + "max": { + "accuracy": 0.9954950495049505, + "ap": 0.8012273184915149, + "f1": 0.7572016460905351 + }, + "similarity": { + "accuracy": 0.9954950495049505, + "accuracy_threshold": 0.9073122143745422, + "ap": 0.8012269960916356, + "f1": 0.7572016460905351, + "f1_threshold": 0.9018043875694275, + "precision": 0.7796610169491526, + "recall": 0.736 + } + } + ], + "validation": [ + { + "cosine": { + "accuracy": 0.9956237623762376, + "accuracy_threshold": 0.9099201560020447, + "ap": 0.794888046665419, + "f1": 0.7552602436323367, + "f1_threshold": 0.909598171710968, + "precision": 0.8461538461538461, + "recall": 0.682 + }, + "dot": { + "accuracy": 0.9902277227722772, + "accuracy_threshold": 0.5442271828651428, + "ap": 0.19330120389645145, + "f1": 0.2625298329355608, + "f1_threshold": 0.4011155664920807, + "precision": 0.21796565389696168, + "recall": 0.33 + }, + "euclidean": { + "accuracy": 0.9954950495049505, + "accuracy_threshold": 0.2651556134223938, + "ap": 0.7694435884693833, + "f1": 0.7494553376906317, + "f1_threshold": 0.27809423208236694, + "precision": 0.8229665071770335, + "recall": 0.688 + }, + "hf_subset": "default", + "languages": [ + "eng-Latn" + ], + "main_score": 0.794888046665419, + "manhattan": { + "accuracy": 0.9955049504950495, + "accuracy_threshold": 3.766726493835449, + "ap": 0.7684690106594636, + "f1": 0.7508196721311474, + "f1_threshold": 3.8263633251190186, + "precision": 0.827710843373494, + "recall": 0.687 + }, + "max": { + "accuracy": 0.9956237623762376, + "ap": 0.794888046665419, + "f1": 0.7552602436323367 + }, + "similarity": { + "accuracy": 0.9956237623762376, + "accuracy_threshold": 0.9099200963973999, + "ap": 0.7948878387935692, + "f1": 0.7552602436323367, + "f1_threshold": 0.9095981121063232, + "precision": 
0.8461538461538461, + "recall": 0.682 + } + } + ] + }, + "task_name": "SprintDuplicateQuestions" +} diff --git a/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/SummEval.json b/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/SummEval.json new file mode 100644 index 0000000000..0c79516405 --- /dev/null +++ b/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/SummEval.json @@ -0,0 +1,24 @@ +{ + "dataset_revision": "cda12ad7615edc362dbf25a00fdd61d3b1eaf93c", + "evaluation_time": 1.9177391529083252, + "kg_co2_emissions": null, + "mteb_version": "1.12.75", + "scores": { + "test": [ + { + "cosine_pearson": 0.3188068764666941, + "cosine_spearman": 0.3131388835922983, + "dot_pearson": 0.16400179504605283, + "dot_spearman": 0.1668532421311614, + "hf_subset": "default", + "languages": [ + "eng-Latn" + ], + "main_score": 0.3131388835922983, + "pearson": 0.3188069315279351, + "spearman": 0.31323675156410546 + } + ] + }, + "task_name": "SummEval" +} diff --git a/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/TwentyNewsgroupsClustering.json b/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/TwentyNewsgroupsClustering.json new file mode 100644 index 0000000000..95b0cdfdbe --- /dev/null +++ b/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/TwentyNewsgroupsClustering.json @@ -0,0 +1,32 @@ +{ + "dataset_revision": "6125ec4e24fa026cec8a478383ee943acfbd5449", + "evaluation_time": 17.457006692886353, + "kg_co2_emissions": null, + "mteb_version": "1.12.75", + "scores": { + "test": [ + { + "hf_subset": "default", + "languages": [ + "eng-Latn" + ], + 
"main_score": 0.23648231952817939, + "v_measure": 0.23648231952817939, + "v_measure_std": 0.019681586971300975, + "v_measures": [ + 0.2809108247742145, + 0.2432090061999711, + 0.24002971126150005, + 0.23266886121180477, + 0.23899987964631256, + 0.21779454939152634, + 0.2266476414061159, + 0.25104573290889665, + 0.2028851658722335, + 0.2306318226092182 + ] + } + ] + }, + "task_name": "TwentyNewsgroupsClustering" +} diff --git a/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/TwentyNewsgroupsClustering.v2.json b/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/TwentyNewsgroupsClustering.v2.json new file mode 100644 index 0000000000..02420b2085 --- /dev/null +++ b/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/TwentyNewsgroupsClustering.v2.json @@ -0,0 +1,26 @@ +{ + "dataset_revision": "6125ec4e24fa026cec8a478383ee943acfbd5449", + "evaluation_time": 0.9795777797698975, + "kg_co2_emissions": null, + "mteb_version": "1.12.75", + "scores": { + "test": [ + { + "hf_subset": "default", + "languages": [ + "eng-Latn" + ], + "main_score": 0.2559434325717902, + "v_measure": 0.2559434325717902, + "v_measure_std": 0.0029490857512028956, + "v_measures": { + "Level 0": [ + 0.2529943468205873, + 0.2588925183229931 + ] + } + } + ] + }, + "task_name": "TwentyNewsgroupsClustering.v2" +} diff --git a/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/TwitterHjerneRetrieval.json b/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/TwitterHjerneRetrieval.json new file mode 100644 index 0000000000..18c5871e1e --- /dev/null +++ 
b/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/TwitterHjerneRetrieval.json @@ -0,0 +1,158 @@ +{ + "dataset_revision": "099ee143c7fdfa6bd7965be8c801cb161c313b29", + "evaluation_time": 0.21496796607971191, + "kg_co2_emissions": null, + "mteb_version": "1.12.75", + "scores": { + "train": [ + { + "hf_subset": "default", + "languages": [ + "dan-Latn" + ], + "main_score": 0.04105, + "map_at_1": 0.01282, + "map_at_10": 0.02481, + "map_at_100": 0.04196, + "map_at_1000": 0.05011, + "map_at_20": 0.0299, + "map_at_3": 0.01923, + "map_at_5": 0.02131, + "mrr_at_1": 0.038461538461538464, + "mrr_at_10": 0.05952380952380952, + "mrr_at_100": 0.08079845500840184, + "mrr_at_1000": 0.08221962576694332, + "mrr_at_20": 0.06924433996802418, + "mrr_at_3": 0.04487179487179487, + "mrr_at_5": 0.05320512820512821, + "nauc_map_at_1000_diff1": 0.43877676062465176, + "nauc_map_at_1000_max": 0.019639999404557355, + "nauc_map_at_1000_std": -0.20226293075630766, + "nauc_map_at_100_diff1": 0.4379837831162035, + "nauc_map_at_100_max": -0.0003349185193268875, + "nauc_map_at_100_std": -0.21721983390948285, + "nauc_map_at_10_diff1": 0.6409296992593275, + "nauc_map_at_10_max": -0.036758327835021226, + "nauc_map_at_10_std": -0.24733287730858092, + "nauc_map_at_1_diff1": 0.833134119966383, + "nauc_map_at_1_max": -0.14581698788832048, + "nauc_map_at_1_std": -0.3707231740205868, + "nauc_map_at_20_diff1": 0.5231441873676606, + "nauc_map_at_20_max": -0.020216787829636963, + "nauc_map_at_20_std": -0.23458029822237542, + "nauc_map_at_3_diff1": 0.8331341199663831, + "nauc_map_at_3_max": -0.027526537586110702, + "nauc_map_at_3_std": -0.3274014524291326, + "nauc_map_at_5_diff1": 0.7206841941573701, + "nauc_map_at_5_max": -0.03665140448986435, + "nauc_map_at_5_std": -0.2694949296954416, + "nauc_mrr_at_1000_diff1": 0.4546383906435604, + "nauc_mrr_at_1000_max": -0.018658071426010347, + "nauc_mrr_at_1000_std": -0.19709731795561858, + 
"nauc_mrr_at_100_diff1": 0.44804003776203516, + "nauc_mrr_at_100_max": -0.013466820876777206, + "nauc_mrr_at_100_std": -0.1938988361766503, + "nauc_mrr_at_10_diff1": 0.514005062048379, + "nauc_mrr_at_10_max": -0.025023246576114135, + "nauc_mrr_at_10_std": -0.17577570940853363, + "nauc_mrr_at_1_diff1": 0.7775121599551773, + "nauc_mrr_at_1_max": -0.0004588925299664257, + "nauc_mrr_at_1_std": -0.30033380737298826, + "nauc_mrr_at_20_diff1": 0.45128161848381687, + "nauc_mrr_at_20_max": -0.021779641408469573, + "nauc_mrr_at_20_std": -0.18491060350575378, + "nauc_mrr_at_3_diff1": 0.7616201713805473, + "nauc_mrr_at_3_max": -0.08352066130616873, + "nauc_mrr_at_3_std": -0.3405563026001873, + "nauc_mrr_at_5_diff1": 0.5926303063084647, + "nauc_mrr_at_5_max": -0.08937227202711431, + "nauc_mrr_at_5_std": -0.24570593421699935, + "nauc_ndcg_at_1000_diff1": 0.23886252882408138, + "nauc_ndcg_at_1000_max": 0.11989581194693663, + "nauc_ndcg_at_1000_std": -0.13295577380243986, + "nauc_ndcg_at_100_diff1": 0.16782043757045095, + "nauc_ndcg_at_100_max": 0.07897254621461802, + "nauc_ndcg_at_100_std": -0.1491963737957376, + "nauc_ndcg_at_10_diff1": 0.46974794783146867, + "nauc_ndcg_at_10_max": -0.0068838330460257394, + "nauc_ndcg_at_10_std": -0.1555753780952521, + "nauc_ndcg_at_1_diff1": 0.7775121599551773, + "nauc_ndcg_at_1_max": -0.0004588925299664257, + "nauc_ndcg_at_1_std": -0.30033380737298826, + "nauc_ndcg_at_20_diff1": 0.2850117190468895, + "nauc_ndcg_at_20_max": -0.0016730569905689845, + "nauc_ndcg_at_20_std": -0.16652588418555264, + "nauc_ndcg_at_3_diff1": 0.8020568750544967, + "nauc_ndcg_at_3_max": -0.005990323537358181, + "nauc_ndcg_at_3_std": -0.3141104519663256, + "nauc_ndcg_at_5_diff1": 0.5959832995831041, + "nauc_ndcg_at_5_max": -0.06472814384006419, + "nauc_ndcg_at_5_std": -0.22641912591231347, + "nauc_precision_at_1000_diff1": -0.15553898404373762, + "nauc_precision_at_1000_max": 0.37265082696690444, + "nauc_precision_at_1000_std": 0.1862924252559177, + 
"nauc_precision_at_100_diff1": -0.1193857945363764, + "nauc_precision_at_100_max": 0.23335050601377114, + "nauc_precision_at_100_std": -0.001278319406148229, + "nauc_precision_at_10_diff1": 0.2051997221310295, + "nauc_precision_at_10_max": 0.10498489086878844, + "nauc_precision_at_10_std": 0.018365539986249662, + "nauc_precision_at_1_diff1": 0.7775121599551773, + "nauc_precision_at_1_max": -0.0004588925299664257, + "nauc_precision_at_1_std": -0.30033380737298826, + "nauc_precision_at_20_diff1": 0.011052082092452422, + "nauc_precision_at_20_max": 0.08980020269434623, + "nauc_precision_at_20_std": -0.05204964272232509, + "nauc_precision_at_3_diff1": 0.7997609439596596, + "nauc_precision_at_3_max": 0.08334640968934388, + "nauc_precision_at_3_std": -0.2765034881222825, + "nauc_precision_at_5_diff1": 0.3810163158392287, + "nauc_precision_at_5_max": 0.013058615864564735, + "nauc_precision_at_5_std": -0.08596841751175756, + "nauc_recall_at_1000_diff1": 0.1235168803903835, + "nauc_recall_at_1000_max": 0.3588109980374423, + "nauc_recall_at_1000_std": -1.7185341055018757, + "nauc_recall_at_100_diff1": -0.027833262940236168, + "nauc_recall_at_100_max": 0.09823261333212424, + "nauc_recall_at_100_std": -0.13657972674505262, + "nauc_recall_at_10_diff1": 0.30041848718240727, + "nauc_recall_at_10_max": 0.008287728851474253, + "nauc_recall_at_10_std": -0.08548814743384145, + "nauc_recall_at_1_diff1": 0.833134119966383, + "nauc_recall_at_1_max": -0.14581698788832048, + "nauc_recall_at_1_std": -0.3707231740205868, + "nauc_recall_at_20_diff1": 0.04901742604675977, + "nauc_recall_at_20_max": -0.014462746234754024, + "nauc_recall_at_20_std": -0.1414653146116804, + "nauc_recall_at_3_diff1": 0.8092961371044376, + "nauc_recall_at_3_max": -0.10672149992572096, + "nauc_recall_at_3_std": -0.3637571412197397, + "nauc_recall_at_5_diff1": 0.4714398766646598, + "nauc_recall_at_5_max": -0.10593136210102469, + "nauc_recall_at_5_std": -0.18515298880208245, + "ndcg_at_1": 0.03846, + "ndcg_at_10": 
0.04105, + "ndcg_at_100": 0.15084, + "ndcg_at_1000": 0.25462, + "ndcg_at_20": 0.06218, + "ndcg_at_3": 0.02865, + "ndcg_at_5": 0.03202, + "precision_at_1": 0.03846, + "precision_at_10": 0.01795, + "precision_at_100": 0.01603, + "precision_at_1000": 0.00336, + "precision_at_20": 0.01859, + "precision_at_3": 0.02137, + "precision_at_5": 0.02051, + "recall_at_1": 0.01282, + "recall_at_10": 0.05, + "recall_at_100": 0.44573, + "recall_at_1000": 0.98718, + "recall_at_20": 0.10791, + "recall_at_3": 0.02244, + "recall_at_5": 0.03205 + } + ] + }, + "task_name": "TwitterHjerneRetrieval" +} diff --git a/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/model_meta.json b/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/model_meta.json new file mode 100644 index 0000000000..9d8c191068 --- /dev/null +++ b/tests/mock_mteb_cache/results/results/sentence-transformers__average_word_embeddings_levy_dependency/no_revision_available/model_meta.json @@ -0,0 +1,17 @@ +{ + "name": "sentence-transformers/average_word_embeddings_levy_dependency", + "revision": "no_revision_available", + "release_date": null, + "languages": [], + "n_parameters": null, + "memory_usage": null, + "max_tokens": null, + "embed_dim": null, + "license": null, + "open_source": null, + "similarity_fn_name": null, + "framework": [ + "Sentence Transformers" + ], + "loader": null +} diff --git a/tests/test_TaskMetadata.py b/tests/test_TaskMetadata.py index 241ec536ea..3b5aa1e158 100644 --- a/tests/test_TaskMetadata.py +++ b/tests/test_TaskMetadata.py @@ -5,6 +5,7 @@ import pytest from mteb import AbsTask +from mteb.abstasks.aggregated_task import AbsTaskAggregate from mteb.abstasks.TaskMetadata import TaskMetadata from mteb.overview import get_tasks @@ -52,8 +53,6 @@ "TwitterHjerneRetrieval", "GerDaLIR", "GerDaLIRSmall", - "GermanDPR", - "GermanQuAD-Retrieval", "LegalQuAD", 
"AILACasedocs", "AILAStatutes", @@ -179,10 +178,23 @@ "TamilNewsClassification", "TenKGnadClusteringP2P.v2", "TenKGnadClusteringS2S.v2", - "SynPerChatbotConvSAClassification", - "CQADupstackRetrieval-Fa", - "VisualSTS17Eng", - "VisualSTS17Multilingual", + "ClimateFEVERHardNegatives", + "DBPediaHardNegatives", + "FEVERHardNegatives", + "HotpotQAHardNegatives", + "MSMARCOHardNegatives", + "NQHardNegatives", + "QuoraRetrievalHardNegatives", + "TopiOCQAHardNegatives", + "MIRACLRetrievalHardNegatives", + "NeuCLIR2022RetrievalHardNegatives", + "NeuCLIR2023RetrievalHardNegatives", + "DBPedia-PLHardNegatives", + "HotpotQA-PLHardNegatives", + "MSMARCO-PLHardNegatives", + "NQ-PLHardNegatives", + "Quora-PLHardNegatives", + "RiaNewsRetrievalHardNegatives", ] @@ -357,23 +369,54 @@ def test_filled_metadata_is_filled(): ) +def test_invalid_metadata_eval_lang_is_invalid(): + with pytest.raises(ValueError): + TaskMetadata( + name="MyTask", + dataset={ + "path": "test/dataset", + "revision": "1.0", + }, + description="testing", + reference="https://aclanthology.org/W19-6138/", + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng_Latn"], # uses underscore instead of dash + main_score="map", + date=("2021-01-01", "2021-12-31"), + domains=["Non-fiction", "Written"], + license="mit", + task_subtypes=["Thematic clustering"], + annotations_creators="expert-annotated", + dialect=[], + sample_creation="found", + bibtex_citation="Someone et al", + ).validate_metadata() + + def test_all_metadata_is_filled_and_valid(): all_tasks = get_tasks() unfilled_metadata = [] + invalid_metadata = [] for task in all_tasks: - if ( - task.metadata.name not in _HISTORIC_DATASETS - and task.metadata.name.replace("HardNegatives", "") - not in _HISTORIC_DATASETS + if task.metadata.name in _HISTORIC_DATASETS or isinstance( + task, AbsTaskAggregate ): - if not task.metadata.is_filled() and ( - not task.metadata.validate_metadata() - ): - 
unfilled_metadata.append(task.metadata.name) - if unfilled_metadata: + continue + + if not task.metadata.is_filled(): + unfilled_metadata.append(task.metadata.name) + else: + if task.metadata.validate_metadata() is not None: + invalid_metadata.append(task.metadata.name) + + if unfilled_metadata or invalid_metadata: raise ValueError( - f"The metadata of the following datasets is not filled: {unfilled_metadata}" + f"The metadata of the following datasets is not filled: {unfilled_metadata}." + + f" The metadata of the following datasets is invalid: {invalid_metadata}." ) diff --git a/tests/test_benchmark/mock_models.py b/tests/test_benchmark/mock_models.py index 22f64bcc89..997c607cc0 100644 --- a/tests/test_benchmark/mock_models.py +++ b/tests/test_benchmark/mock_models.py @@ -46,7 +46,7 @@ def encode(self, sentences, prompt_name: str | None = None, **kwargs): class MockCLIPEncoder: mteb_model_meta = ModelMeta( name="mock/MockCLIPModel", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="3d74acf9a28c67741b2f4f2ea7635f0aaf6f0268", release_date="2021-02-06", modalities=["image", "text"], @@ -92,7 +92,7 @@ def calculate_probs(self, text_embeddings, image_embeddings): class MockAudioEncoder: mteb_model_meta = ModelMeta( name="mock/MockAudioEncoder", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="7d091cd70772c5c0ecf7f00b5f12ca609a99d69d", release_date="2024-01-01", modalities=["audio"], @@ -154,7 +154,7 @@ def get_fused_embeddings( class MockMocoEncoder: mteb_model_meta = ModelMeta( name="mock/MockMocoModel", - languages=["eng_Latn"], + languages=["eng-Latn"], revision="7d091cd70772c5c0ecf7f00b5f12ca609a99d69d", release_date="2024-01-01", modalities=["image"], diff --git a/tests/test_citation_formatting.py b/tests/test_citation_formatting.py new file mode 100644 index 0000000000..dd2c731994 --- /dev/null +++ b/tests/test_citation_formatting.py @@ -0,0 +1,64 @@ +from __future__ import annotations + +import bibtexparser +import pytest +from
bibtexparser.bwriter import BibTexWriter + +import mteb +from mteb.abstasks import AbsTask +from mteb.benchmarks.benchmark import Benchmark + + +def format_bibtex(bibtex_str: str) -> str | None: + parser = bibtexparser.bparser.BibTexParser( + common_strings=True, ignore_nonstandard_types=False, interpolate_strings=False + ) + + bib_database = bibtexparser.loads(bibtex_str, parser) + if not bib_database.entries: + return None + + writer = BibTexWriter() + writer.indent = " " + writer.comma_first = False + writer.add_trailing_comma = True + + return writer.write(bib_database).strip() + + +@pytest.fixture(params=mteb.get_tasks()) +def task(request): + return request.param + + +def test_task_bibtex(task: AbsTask): + task_name = task.metadata.name + bibtex_citation = task.metadata.bibtex_citation + + if not bibtex_citation or not bibtex_citation.strip(): + pytest.skip(f"Task {task_name} has no bibtex_citation") + bibtex_citation = bibtex_citation.strip() + + formatted_bibtex = format_bibtex(bibtex_citation) + assert formatted_bibtex is not None and formatted_bibtex == bibtex_citation, ( + f"Wrong BibTeX citation formatting for task {task_name}" + ) + + +@pytest.fixture(params=mteb.get_benchmarks()) +def benchmark(request): + return request.param + + +def test_benchmark_bibtex(benchmark: Benchmark): + benchmark_name = benchmark.name + bibtex_citation = benchmark.citation + + if not bibtex_citation or not bibtex_citation.strip(): + pytest.skip(f"Benchmark {benchmark_name} has no bibtex_citation") + bibtex_citation = bibtex_citation.strip() + + formatted_bibtex = format_bibtex(bibtex_citation) + assert formatted_bibtex is not None and formatted_bibtex == bibtex_citation, ( + f"Wrong BibTeX citation formatting for benchmark {benchmark_name}" + ) diff --git a/tests/test_embedding_caching.py b/tests/test_embedding_caching.py index 77e0546440..1704d347ff 100644 --- a/tests/test_embedding_caching.py +++ b/tests/test_embedding_caching.py @@ -50,14 +50,14 @@ def 
test_caching_functionality(self, cache_dir): ] # First call - should use the model to compute embeddings - query_embeddings1 = wrapped_model.encode(queries, task_name="query") - corpus_embeddings1 = wrapped_model.encode(corpus, task_name="corpus") + query_embeddings1 = wrapped_model.encode(queries, task_name="DummyTaskQuery") + corpus_embeddings1 = wrapped_model.encode(corpus, task_name="DummyTaskCorpus") assert dummy_model.call_count == 2 # One call for queries, one for corpus # Second call - should use cached embeddings - query_embeddings2 = wrapped_model.encode(queries) - corpus_embeddings2 = wrapped_model.encode(corpus) + query_embeddings2 = wrapped_model.encode(queries, task_name="DummyTaskQuery") + corpus_embeddings2 = wrapped_model.encode(corpus, task_name="DummyTaskCorpus") assert dummy_model.call_count == 2 # No additional calls to the model @@ -66,18 +66,22 @@ def test_caching_functionality(self, cache_dir): np.testing.assert_allclose(corpus_embeddings1, corpus_embeddings2) # Verify that cache files were created - assert (cache_dir / "cache" / "vectors.npy").exists() - assert (cache_dir / "cache" / "index.json").exists() + assert (cache_dir / "DummyTaskQuery" / "vectors.npy").exists() + assert (cache_dir / "DummyTaskQuery" / "index.json").exists() + assert (cache_dir / "DummyTaskCorpus" / "vectors.npy").exists() + assert (cache_dir / "DummyTaskCorpus" / "index.json").exists() # Test with a new query - should use cache for existing queries and compute for new one new_queries = ["What is the role of insulin in diabetes?"] - query_embeddings3 = wrapped_model.encode(new_queries) + query_embeddings3 = wrapped_model.encode( + new_queries, task_name="DummyTaskQuery" + ) assert dummy_model.call_count == 3 # One additional call for the new query assert query_embeddings3.shape == (1, dummy_model.embedding_dim) # try with a cached query only - _ = wrapped_model.encode(queries) + _ = wrapped_model.encode(queries, task_name="DummyTaskQuery") assert 
dummy_model.call_count == 3 wrapped_model.close() # delete to allow cleanup on Windows diff --git a/tests/test_load_results/test_benchmark_results.py b/tests/test_load_results/test_benchmark_results.py new file mode 100644 index 0000000000..36b592e3b7 --- /dev/null +++ b/tests/test_load_results/test_benchmark_results.py @@ -0,0 +1,162 @@ +"""Tests for the BenchmarkResult class""" + +from __future__ import annotations + +import os +from pathlib import Path + +import pandas as pd +import pytest + +import mteb +from mteb.load_results import ModelResult +from mteb.load_results.benchmark_results import BenchmarkResults + + +@pytest.fixture +def benchmark_results() -> BenchmarkResults: + tests_path = Path(__file__).parent.parent / "mock_mteb_cache" + + os.environ["MTEB_CACHE"] = str(tests_path) + + results = mteb.load_results(download_latest=False) + + return results + + +def test_indexing(benchmark_results: BenchmarkResults) -> None: + model_res = benchmark_results.model_results[0] + assert isinstance(model_res, ModelResult), ( + "indexing into the list should return a ModelResult" + ) + + +def test_select_models(benchmark_results: BenchmarkResults) -> None: + model_name = "sentence-transformers/all-MiniLM-L6-v2" + bench_res = benchmark_results.select_models([model_name]) + assert isinstance(bench_res, BenchmarkResults) + assert isinstance(bench_res[0], ModelResult) + assert len(bench_res.model_results) > 1 # multiple revisions + assert bench_res[0].model_name == model_name + + # with revision + model_meta = mteb.get_model_meta(model_name) + bench_res = benchmark_results.select_models( + names=[model_name], + revisions=[model_meta.revision], + ) + assert bench_res[0].model_name == model_name + assert bench_res[0].model_revision == model_meta.revision + assert len(bench_res.model_results) == 1 # only one revision + + # with model_meta + model_meta = mteb.get_model_meta(model_name) + bench_res = benchmark_results.select_models( + names=[model_meta], + ) + assert 
bench_res[0].model_name == model_name + assert bench_res[0].model_revision == model_meta.revision + assert len(bench_res.model_results) == 1 # only one revision + + +def test_select_tasks(benchmark_results: BenchmarkResults) -> None: + tasks = [mteb.get_task("STS12")] + bench_res = benchmark_results.select_tasks(tasks=tasks) + task_names = bench_res.task_names + assert isinstance(task_names, list) + assert len(task_names) == 1 + assert task_names[0] == "STS12" + + +def test_join_revisions(benchmark_results: BenchmarkResults) -> None: + model_name = "sentence-transformers/all-MiniLM-L6-v2" + bench_res = benchmark_results.select_models([model_name]) + + assert len(bench_res.model_revisions) == 2, ( + "There should only be two revisions for this model in the mock cache" + ) + + bench_res = bench_res.join_revisions() + assert isinstance(bench_res, BenchmarkResults) + assert len(bench_res.model_revisions) == 1 + revision = bench_res.model_revisions[0]["revision"] + assert revision == mteb.get_model_meta(model_name).revision + + +def test_to_dataframe( + benchmark_results: BenchmarkResults, +) -> None: + required_columns = [ + "model_name", + "task_name", + "task_name", + "score", + "subset", + "split", + ] + t1 = benchmark_results.to_dataframe(aggregation_level="subset", format="long") + assert isinstance(t1, pd.DataFrame) + assert all(col in t1.columns for col in required_columns), "Columns are missing" + assert t1.shape[0] > 0, "Results table is empty" + + t2 = benchmark_results.to_dataframe(aggregation_level="split", format="long") + assert all( + col in t2.columns for col in required_columns if col not in ["subset"] + ), "Columns are missing" + assert "subset" not in t2.columns, "Subset column should not be present" + assert t1.shape[0] >= t2.shape[0], ( + "Aggregation level 'subset' should have at least as many rows as 'split'" + ) + + t3 = benchmark_results.to_dataframe(aggregation_level="task", format="long") + assert all( + col in t3.columns for col in required_columns if
col not in ["subset", "split"] + ), "Columns are missing" + assert "subset" not in t3.columns, "Subset column should not be present" + assert "split" not in t3.columns, "Split column should not be present" + assert t2.shape[0] >= t3.shape[0], ( + "Aggregation level 'task' should have more rows than 'split'" + ) + + # test no model revisions + benchmark_res = benchmark_results.join_revisions() + t1 = benchmark_res.to_dataframe(aggregation_level="subset", format="long") + assert "model_revision" not in t1.columns, ( + "Model revision column should not be present" + ) + # Test the wide format + model_name = "sentence-transformers/all-MiniLM-L6-v2" + task = mteb.get_task("BornholmBitextMining") + + # simplify down to one model and one task + br = benchmark_res.select_models([model_name]).select_tasks([task]) + + t4_wide = br.to_dataframe(aggregation_level="task", format="wide") + t4_long = br.to_dataframe(aggregation_level="task", format="long") + assert isinstance(t4_wide, pd.DataFrame) + + # check that the scores are the same for a given model + assert t4_wide[model_name][0] == t4_long["score"][0], ( + "The scores in wide and long format should be the same" + ) + + +def test_utility_properties( + benchmark_results: BenchmarkResults, +) -> None: + br = benchmark_results + assert isinstance(br.task_names, list) and isinstance(br.task_names[0], str) + assert ( + isinstance(br.languages, list) + and isinstance(br.languages[0], str) + and "eng" in br.languages + ) + assert isinstance(br.model_names, list) and isinstance(br.model_names[0], str) + assert ( + isinstance(br.model_revisions, list) + and isinstance(br.model_revisions[0], dict) + and "model_name" in br.model_revisions[0] + and "revision" in br.model_revisions[0] + ) + assert isinstance(br.task_types, list) and isinstance(br.task_types[0], str) + assert isinstance(br.domains, list) and isinstance(br.domains[0], str) diff --git a/tests/test_load_results/test_model_results.py 
b/tests/test_load_results/test_model_results.py
new file mode 100644
index 0000000000..09921ae68b
--- /dev/null
+++ b/tests/test_load_results/test_model_results.py
@@ -0,0 +1,121 @@
+"""Tests for the ModelResult class"""
+
+from __future__ import annotations
+
+import pandas as pd
+import pytest
+
+import mteb
+from mteb.load_results import ModelResult
+from mteb.load_results.task_results import TaskResult
+
+# TODO: v2 ^ we probably want to refactor such that this import looks like
+# from mteb.results import ModelResult, TaskResults
+
+
+@pytest.fixture
+def model_result() -> ModelResult:
+    task_result = TaskResult(
+        dataset_revision="1.0",
+        task_name="BornholmBitextMining",  # just dummy results
+        mteb_version="1.0.0",
+        evaluation_time=100,
+        scores={
+            "train": [
+                {
+                    "main_score": 0.5,
+                    "hf_subset": "en-de",
+                    "languages": ["eng-Latn", "deu-Latn"],
+                },
+                {
+                    "main_score": 0.6,
+                    "hf_subset": "en-fr",
+                    "languages": ["eng-Latn", "fra-Latn"],
+                },
+            ]
+        },
+    )
+
+    return ModelResult(
+        model_name="mock_model",
+        model_revision="dummy",
+        task_results=[task_result],
+    )
+
+
+def test_indexing(model_result: ModelResult) -> None:
+    res = model_result[0]
+    assert isinstance(res, TaskResult), (
+        "indexing into the ModelResult should return a TaskResult"
+    )
+
+
+def test_utility_properties(
+    model_result: ModelResult,
+) -> None:
+    mr = model_result
+    assert isinstance(mr.task_names, list) and isinstance(mr.task_names[0], str)
+    assert (
+        isinstance(mr.languages, list)
+        and isinstance(mr.languages[0], str)
+        and "eng" in mr.languages  # known from mock data
+    )
+    assert isinstance(mr.task_types, list) and isinstance(mr.task_types[0], str)
+    assert isinstance(mr.domains, list) and isinstance(mr.domains[0], str)
+
+
+def test_select_tasks(
+    model_result: ModelResult,
+) -> None:
+    tasks = [mteb.get_task("BornholmBitextMining")]
+    mr = model_result.select_tasks(tasks=tasks)
+    task_names = mr.task_names
+    assert isinstance(task_names, list)
+    assert len(task_names) == 1
+    assert task_names[0] == "BornholmBitextMining"
+
+
+def test_to_dataframe(
+    model_result: ModelResult,  # noqa: F811
+) -> None:
+    """Check to_dataframe output for each aggregation level and both formats."""
+    mr = model_result
+    required_columns = [
+        "model_name",
+        "task_name",
+        "score",
+        "subset",
+        "split",
+    ]
+    t1 = mr.to_dataframe(aggregation_level="subset", format="long")
+    assert isinstance(t1, pd.DataFrame)
+    assert all(col in t1.columns for col in required_columns), "Columns are missing"
+    assert t1.shape[0] > 0, "Results table is empty"
+
+    t2 = mr.to_dataframe(aggregation_level="split", format="long")
+    assert all(
+        col in t2.columns for col in required_columns if col not in ["subset"]
+    ), "Columns are missing"
+    assert "subset" not in t2.columns, "Subset column should not be present"
+    assert t1.shape[0] >= t2.shape[0], (
+        "Aggregation level 'subset' should have at least as many rows as 'split'"
+    )
+
+    t3 = mr.to_dataframe(aggregation_level="task", format="long")
+    assert all(
+        col in t3.columns for col in required_columns if col not in ["subset", "split"]
+    ), "Columns are missing"
+    assert "subset" not in t3.columns, "Subset column should not be present"
+    assert "split" not in t3.columns, "Split column should not be present"
+    assert t2.shape[0] >= t3.shape[0], (
+        "Aggregation level 'split' should have at least as many rows as 'task'"
+    )
+
+    t4_wide = mr.to_dataframe(aggregation_level="task", format="wide")
+    t4_long = mr.to_dataframe(aggregation_level="task", format="long")
+    assert isinstance(t4_wide, pd.DataFrame)
+
+    # we know it is only one task
+    assert t4_wide[mr.model_name].tolist()[0] == t4_long["score"][0], (
+        "The scores in wide and long format should be the same"
+    )
diff --git a/tests/test_overview.py b/tests/test_overview.py
index 4486bc1136..801929817c 100644
--- a/tests/test_overview.py
+++ b/tests/test_overview.py
@@ -72,6 +72,7 @@ def test_get_task(
 @pytest.mark.parametrize("exclude_superseded_datasets", [True, False])
 @pytest.mark.parametrize("modalities", [["text"], ["image"], ["text", "image"],
None])
 @pytest.mark.parametrize("exclusive_modality_filter", [True, False])
+@pytest.mark.parametrize("exclude_aggregate", [True, False])
 def test_get_tasks(
     languages: list[str],
     script: list[str],
@@ -80,6 +81,7 @@ def test_get_tasks(
     exclude_superseded_datasets: bool,
     modalities: list[MODALITIES] | None,
     exclusive_modality_filter: bool,
+    exclude_aggregate: bool,
 ):
     tasks = mteb.get_tasks(
         languages=languages,
@@ -89,6 +91,7 @@ def test_get_tasks(
         exclude_superseded=exclude_superseded_datasets,
         modalities=modalities,
         exclusive_modality_filter=exclusive_modality_filter,
+        exclude_aggregate=exclude_aggregate,
     )
 
     for task in tasks:
@@ -110,6 +113,9 @@ def test_get_tasks(
                 assert set(task.modalities) == set(modalities)
             else:
                 assert any(mod in task.modalities for mod in modalities)
+        if exclude_aggregate:
+            # Aggregate tasks should be excluded
+            assert not task.is_aggregate
 
 
 def test_get_tasks_filtering():
diff --git a/tests/test_tasks/test_all_abstasks.py b/tests/test_tasks/test_all_abstasks.py
index 996dc1ba87..2c7bf4ed6a 100644
--- a/tests/test_tasks/test_all_abstasks.py
+++ b/tests/test_tasks/test_all_abstasks.py
@@ -57,17 +57,6 @@
 )
 
 
-dataset_revisions = list(
-    {  # deduplicate as multiple tasks rely on the same dataset (save us at least 100 test cases)
-        (t.metadata.dataset["path"], t.metadata.dataset["revision"])
-        for t in mteb.get_tasks(exclude_superseded=False)
-        if not isinstance(t, (AbsTaskAggregate, AbsTaskSpeedTask))
-        and t.metadata.name != "AfriSentiLangClassification"
-        and t.metadata.name not in ALL_MOCK_TASKS
-    }
-)
-
-
 @pytest.mark.parametrize("task", tasks)
 @patch("datasets.load_dataset")
 @patch("datasets.concatenate_datasets")
@@ -122,3 +111,10 @@ def test_superseded_dataset_exists():
         assert task.superseded_by in TASKS_REGISTRY, (
             f"{task} is superseded by {task.superseded_by} but {task.superseded_by} is not in the TASKS_REGISTRY"
         )
+
+
+def test_is_aggregate_property_correct():
+    tasks = mteb.get_tasks()  # called with default arguments
+
+    for task in tasks:  # the flag must be True exactly for AbsTaskAggregate instances
+        assert task.is_aggregate == isinstance(task, AbsTaskAggregate)