diff --git a/mteb/descriptive_stats/BitextMining/WebFAQBitextMiningQAs.json b/mteb/descriptive_stats/BitextMining/WebFAQBitextMiningQAs.json new file mode 100644 index 0000000000..c87b7d24f5 --- /dev/null +++ b/mteb/descriptive_stats/BitextMining/WebFAQBitextMiningQAs.json @@ -0,0 +1,2227 @@ +{ + "default": { + "num_samples": 682057, + "number_of_characters": 526563222, + "unique_pairs": 682057, + "min_sentence1_length": 43, + "average_sentence1_length": 386.03851584251754, + "max_sentence1_length": 18410, + "unique_sentence1": 379274, + "min_sentence2_length": 41, + "average_sentence2_length": 385.98379607569456, + "max_sentence2_length": 21081, + "unique_sentence2": 398878, + "hf_subset_descriptive_stats": { + "ara-fas": { + "num_samples": 609, + "number_of_characters": 411293, + "unique_pairs": 609, + "min_sentence1_length": 54, + "average_sentence1_length": 321.44499178981937, + "max_sentence1_length": 2446, + "unique_sentence1": 609, + "min_sentence2_length": 63, + "average_sentence2_length": 353.9129720853859, + "max_sentence2_length": 2754, + "unique_sentence2": 609 + }, + "ara-heb": { + "num_samples": 978, + "number_of_characters": 628664, + "unique_pairs": 978, + "min_sentence1_length": 68, + "average_sentence1_length": 336.19529652351736, + "max_sentence1_length": 4204, + "unique_sentence1": 978, + "min_sentence2_length": 62, + "average_sentence2_length": 306.61042944785277, + "max_sentence2_length": 3077, + "unique_sentence2": 978 + }, + "jpn-kor": { + "num_samples": 4820, + "number_of_characters": 1914641, + "unique_pairs": 4820, + "min_sentence1_length": 43, + "average_sentence1_length": 194.2630705394191, + "max_sentence1_length": 1741, + "unique_sentence1": 4820, + "min_sentence2_length": 43, + "average_sentence2_length": 202.96535269709543, + "max_sentence2_length": 1830, + "unique_sentence2": 4820 + }, + "jpn-vie": { + "num_samples": 1356, + "number_of_characters": 823292, + "unique_pairs": 1356, + "min_sentence1_length": 50, + "average_sentence1_length": 202.49115044247787, + "max_sentence1_length": 1660, + "unique_sentence1": 1356, + "min_sentence2_length": 71, + "average_sentence2_length": 404.65634218289085, + "max_sentence2_length": 3938, + "unique_sentence2": 1356 + }, + "jpn-zho": { + "num_samples": 1728, + "number_of_characters": 587091, + "unique_pairs": 1728, + "min_sentence1_length": 49, + "average_sentence1_length": 198.45138888888889, + "max_sentence1_length": 6803, + "unique_sentence1": 1728, + "min_sentence2_length": 43, + "average_sentence2_length": 141.30034722222223, + "max_sentence2_length": 4500, + "unique_sentence2": 1728 + }, + "kor-vie": { + "num_samples": 1386, + "number_of_characters": 906969, + "unique_pairs": 1386, + "min_sentence1_length": 50, + "average_sentence1_length": 224.98701298701297, + "max_sentence1_length": 3677, + "unique_sentence1": 1386, + "min_sentence2_length": 71, + "average_sentence2_length": 429.3917748917749, + "max_sentence2_length": 6989, + "unique_sentence2": 1386 + }, + "kor-zho": { + "num_samples": 1087, + "number_of_characters": 354672, + "unique_pairs": 1087, + "min_sentence1_length": 64, + "average_sentence1_length": 193.25022999080036, + "max_sentence1_length": 1064, + "unique_sentence1": 1087, + "min_sentence2_length": 45, + "average_sentence2_length": 133.03495860165594, + "max_sentence2_length": 729, + "unique_sentence2": 1087 + }, + "vie-zho": { + "num_samples": 646, + "number_of_characters": 341786, + "unique_pairs": 646, + "min_sentence1_length": 89, + "average_sentence1_length": 388.8777089783282, + "max_sentence1_length": 2114, + "unique_sentence1": 646, + "min_sentence2_length": 45, + "average_sentence2_length": 140.20278637770897, + "max_sentence2_length": 1267, + "unique_sentence2": 646 + }, + "ind-msa": { + "num_samples": 455, + "number_of_characters": 197047, + "unique_pairs": 455, + "min_sentence1_length": 78, + "average_sentence1_length": 217.3802197802198, + "max_sentence1_length": 870, + "unique_sentence1": 455, + "min_sentence2_length": 72, + "average_sentence2_length": 215.69010989010988, + "max_sentence2_length": 784, + "unique_sentence2": 455 + }, + "ind-tgl": { + "num_samples": 378, + "number_of_characters": 265203, + "unique_pairs": 378, + "min_sentence1_length": 74, + "average_sentence1_length": 329.08730158730157, + "max_sentence1_length": 1418, + "unique_sentence1": 378, + "min_sentence2_length": 75, + "average_sentence2_length": 372.5079365079365, + "max_sentence2_length": 1600, + "unique_sentence2": 378 + }, + "ind-tha": { + "num_samples": 1258, + "number_of_characters": 794128, + "unique_pairs": 1258, + "min_sentence1_length": 72, + "average_sentence1_length": 347.3974562798092, + "max_sentence1_length": 3226, + "unique_sentence1": 1258, + "min_sentence2_length": 63, + "average_sentence2_length": 283.86486486486484, + "max_sentence2_length": 2816, + "unique_sentence2": 1258 + }, + "bul-ces": { + "num_samples": 1485, + "number_of_characters": 922134, + "unique_pairs": 1485, + "min_sentence1_length": 71, + "average_sentence1_length": 325.8868686868687, + "max_sentence1_length": 1945, + "unique_sentence1": 1485, + "min_sentence2_length": 56, + "average_sentence2_length": 295.07878787878786, + "max_sentence2_length": 1921, + "unique_sentence2": 1485 + }, + "bul-lav": { + "num_samples": 710, + "number_of_characters": 492895, + "unique_pairs": 710, + "min_sentence1_length": 74, + "average_sentence1_length": 358.14647887323946, + "max_sentence1_length": 2765, + "unique_sentence1": 710, + "min_sentence2_length": 61, + "average_sentence2_length": 336.0718309859155, + "max_sentence2_length": 2523, + "unique_sentence2": 710 + }, + "bul-lit": { + "num_samples": 803, + "number_of_characters": 540245, + "unique_pairs": 803, + "min_sentence1_length": 71, + "average_sentence1_length": 346.31382316313824, + "max_sentence1_length": 1945, + "unique_sentence1": 803, + "min_sentence2_length": 67, + "average_sentence2_length": 326.4694894146949, + "max_sentence2_length": 1925, + "unique_sentence2": 803 + }, + "bul-pol": { + "num_samples": 1635, + "number_of_characters": 1126043, + "unique_pairs": 1635, + "min_sentence1_length": 69, + "average_sentence1_length": 347.74128440366974, + "max_sentence1_length": 2431, + "unique_sentence1": 1635, + "min_sentence2_length": 68, + "average_sentence2_length": 340.97003058103974, + "max_sentence2_length": 2277, + "unique_sentence2": 1635 + }, + "bul-rus": { + "num_samples": 1476, + "number_of_characters": 995879, + "unique_pairs": 1476, + "min_sentence1_length": 71, + "average_sentence1_length": 337.76151761517616, + "max_sentence1_length": 4620, + "unique_sentence1": 1476, + "min_sentence2_length": 63, + "average_sentence2_length": 336.9532520325203, + "max_sentence2_length": 4654, + "unique_sentence2": 1476 + }, + "bul-slk": { + "num_samples": 1154, + "number_of_characters": 777946, + "unique_pairs": 1154, + "min_sentence1_length": 73, + "average_sentence1_length": 349.93934142114387, + "max_sentence1_length": 1945, + "unique_sentence1": 1154, + "min_sentence2_length": 68, + "average_sentence2_length": 324.19064124783364, + "max_sentence2_length": 2073, + "unique_sentence2": 1154 + }, + "bul-slv": { + "num_samples": 1034, + "number_of_characters": 673719, + "unique_pairs": 1034, + "min_sentence1_length": 86, + "average_sentence1_length": 339.068665377176, + "max_sentence1_length": 1945, + "unique_sentence1": 1034, + "min_sentence2_length": 79, + "average_sentence2_length": 312.49709864603483, + "max_sentence2_length": 1872, + "unique_sentence2": 1034 + }, + "bul-srp": { + "num_samples": 296, + "number_of_characters": 222838, + "unique_pairs": 296, + "min_sentence1_length": 92, + "average_sentence1_length": 390.4054054054054, + "max_sentence1_length": 1945, + "unique_sentence1": 296, + "min_sentence2_length": 87, + "average_sentence2_length": 362.4256756756757, + "max_sentence2_length": 1845, + "unique_sentence2": 296 + }, + "bul-ukr": { + "num_samples": 1074, + "number_of_characters": 708525, + "unique_pairs": 1074, + "min_sentence1_length": 64, + "average_sentence1_length": 335.2728119180633, + "max_sentence1_length": 2057, + "unique_sentence1": 1074, + "min_sentence2_length": 59, + "average_sentence2_length": 324.4338919925512, + "max_sentence2_length": 2042, + "unique_sentence2": 1074 + }, + "ces-lav": { + "num_samples": 875, + "number_of_characters": 569552, + "unique_pairs": 875, + "min_sentence1_length": 74, + "average_sentence1_length": 318.83314285714283, + "max_sentence1_length": 1921, + "unique_sentence1": 875, + "min_sentence2_length": 76, + "average_sentence2_length": 332.08342857142856, + "max_sentence2_length": 1948, + "unique_sentence2": 875 + }, + "ces-lit": { + "num_samples": 1002, + "number_of_characters": 674361, + "unique_pairs": 1002, + "min_sentence1_length": 73, + "average_sentence1_length": 328.37225548902194, + "max_sentence1_length": 2956, + "unique_sentence1": 1002, + "min_sentence2_length": 84, + "average_sentence2_length": 344.6427145708583, + "max_sentence2_length": 2995, + "unique_sentence2": 1002 + }, + "ces-pol": { + "num_samples": 3367, + "number_of_characters": 2230397, + "unique_pairs": 3367, + "min_sentence1_length": 56, + "average_sentence1_length": 317.48589248589246, + "max_sentence1_length": 2453, + "unique_sentence1": 3367, + "min_sentence2_length": 64, + "average_sentence2_length": 344.9426789426789, + "max_sentence2_length": 2621, + "unique_sentence2": 3367 + }, + "ces-rus": { + "num_samples": 2144, + "number_of_characters": 1438311, + "unique_pairs": 2144, + "min_sentence1_length": 56, + "average_sentence1_length": 319.39925373134326, + "max_sentence1_length": 2349, + "unique_sentence1": 2144, + "min_sentence2_length": 71, + "average_sentence2_length": 351.4547574626866, + "max_sentence2_length": 2509, + "unique_sentence2": 2144 + }, + "ces-slk": { + "num_samples": 2551, + "number_of_characters": 1733126, + "unique_pairs": 2551, + "min_sentence1_length": 65, + "average_sentence1_length": 334.90395923167387, + "max_sentence1_length": 7967, + "unique_sentence1": 2551, + "min_sentence2_length": 67, + "average_sentence2_length": 344.48686789494315, + "max_sentence2_length": 10365, + "unique_sentence2": 2551 + }, + "ces-slv": { + "num_samples": 1370, + "number_of_characters": 848116, + "unique_pairs": 1370, + "min_sentence1_length": 54, + "average_sentence1_length": 304.5014598540146, + "max_sentence1_length": 2956, + "unique_sentence1": 1370, + "min_sentence2_length": 59, + "average_sentence2_length": 314.5613138686131, + "max_sentence2_length": 3006, + "unique_sentence2": 1370 + }, + "ces-srp": { + "num_samples": 362, + "number_of_characters": 238713, + "unique_pairs": 362, + "min_sentence1_length": 77, + "average_sentence1_length": 322.69060773480663, + "max_sentence1_length": 1921, + "unique_sentence1": 362, + "min_sentence2_length": 80, + "average_sentence2_length": 336.73756906077347, + "max_sentence2_length": 1861, + "unique_sentence2": 362 + }, + "ces-ukr": { + "num_samples": 1285, + "number_of_characters": 789567, + "unique_pairs": 1285, + "min_sentence1_length": 56, + "average_sentence1_length": 295.0536964980545, + "max_sentence1_length": 1921, + "unique_sentence1": 1285, + "min_sentence2_length": 69, + "average_sentence2_length": 319.3953307392996, + "max_sentence2_length": 2042, + "unique_sentence2": 1285 + }, + "hrv-slk": { + "num_samples": 313, + "number_of_characters": 184033, + "unique_pairs": 313, + "min_sentence1_length": 112, + "average_sentence1_length": 295.80511182108626, + "max_sentence1_length": 1393, + "unique_sentence1": 313, + "min_sentence2_length": 104, + "average_sentence2_length": 292.1597444089457, + "max_sentence2_length": 1411, + "unique_sentence2": 313 + }, + "kat-rus": { + "num_samples": 262, + "number_of_characters": 190050, + "unique_pairs": 262, + "min_sentence1_length": 68, + "average_sentence1_length": 362.4351145038168, + "max_sentence1_length": 2879, + "unique_sentence1": 262, + "min_sentence2_length": 74, + "average_sentence2_length": 362.9465648854962, + "max_sentence2_length": 3069, + "unique_sentence2": 262 + }, + "lav-lit": { + "num_samples": 1061, + "number_of_characters": 794243, + "unique_pairs": 1061, + "min_sentence1_length": 61, + "average_sentence1_length": 372.32139491046183, + "max_sentence1_length": 2410, + "unique_sentence1": 1061, + "min_sentence2_length": 67, + "average_sentence2_length": 376.258246936852, + "max_sentence2_length": 2463, + "unique_sentence2": 1061 + }, + "lav-pol": { + "num_samples": 951, + "number_of_characters": 701354, + "unique_pairs": 951, + "min_sentence1_length": 63, + "average_sentence1_length": 359.69295478443746, + "max_sentence1_length": 2044, + "unique_sentence1": 951, + "min_sentence2_length": 72, + "average_sentence2_length": 377.7981072555205, + "max_sentence2_length": 2234, + "unique_sentence2": 951 + }, + "lav-rus": { + "num_samples": 1412, + "number_of_characters": 1039535, + "unique_pairs": 1412, + "min_sentence1_length": 61, + "average_sentence1_length": 358.03753541076486, + "max_sentence1_length": 2206, + "unique_sentence1": 1412, + "min_sentence2_length": 63, + "average_sentence2_length": 378.1770538243626, + "max_sentence2_length": 2383, + "unique_sentence2": 1412 + }, + "lav-slk": { + "num_samples": 789, + "number_of_characters": 535091, + "unique_pairs": 789, + "min_sentence1_length": 75, + "average_sentence1_length": 342.3295310519645, + "max_sentence1_length": 1948, + "unique_sentence1": 789, + "min_sentence2_length": 68, + "average_sentence2_length": 335.8593155893536, + "max_sentence2_length": 1910, + "unique_sentence2": 789 + }, + "lav-slv": { + "num_samples": 518, + "number_of_characters": 340127, + "unique_pairs": 518, + "min_sentence1_length": 76, + "average_sentence1_length": 329.33976833976834, + "max_sentence1_length": 1948, + "unique_sentence1": 518, + "min_sentence2_length": 71, + "average_sentence2_length": 327.2760617760618, + "max_sentence2_length": 1872, + "unique_sentence2": 518 + }, + "lav-ukr": { + "num_samples": 579, + "number_of_characters": 428022, + "unique_pairs": 579, + "min_sentence1_length": 61, + "average_sentence1_length": 365.16062176165804, + "max_sentence1_length": 2410, + "unique_sentence1": 579, + "min_sentence2_length": 59, + "average_sentence2_length": 374.08290155440415, + "max_sentence2_length": 2412, + "unique_sentence2": 579 + }, + "lit-pol": { + "num_samples": 1026, + "number_of_characters": 767128, + "unique_pairs": 1026, + "min_sentence1_length": 64, + "average_sentence1_length": 366.05750487329436, + "max_sentence1_length": 1990, + "unique_sentence1": 1026, + "min_sentence2_length": 79, + "average_sentence2_length": 381.630604288499, + "max_sentence2_length": 2234, + "unique_sentence2": 1026 + }, + "lit-rus": { + "num_samples": 961, + "number_of_characters": 744509, + "unique_pairs": 961, + "min_sentence1_length": 67, + "average_sentence1_length": 379.4672216441207, + "max_sentence1_length": 3141, + "unique_sentence1": 961, + "min_sentence2_length": 63, + "average_sentence2_length": 395.2559833506764, + "max_sentence2_length": 2201, + "unique_sentence2": 961 + }, + "lit-slk": { + "num_samples": 859, + "number_of_characters": 583451, + "unique_pairs": 859, + "min_sentence1_length": 74, + "average_sentence1_length": 344.7508731082654, + "max_sentence1_length": 1925, + "unique_sentence1": 859, + "min_sentence2_length": 68, + "average_sentence2_length": 334.47031431897557, + "max_sentence2_length": 1961, + "unique_sentence2": 859 + }, + "lit-slv": { + "num_samples": 607, + "number_of_characters": 438866, + "unique_pairs": 607, + "min_sentence1_length": 70, + "average_sentence1_length": 366.02141680395385, + "max_sentence1_length": 2995, + "unique_sentence1": 607, + "min_sentence2_length": 75, + "average_sentence2_length": 356.9868204283361, + "max_sentence2_length": 3006, + "unique_sentence2": 607 + }, + "lit-ukr": { + "num_samples": 639, + "number_of_characters": 463616, + "unique_pairs": 639, + "min_sentence1_length": 67, + "average_sentence1_length": 361.1627543035994, + "max_sentence1_length": 2463, + "unique_sentence1": 639, + "min_sentence2_length": 59, + "average_sentence2_length": 364.37089201877933, + "max_sentence2_length": 2412, + "unique_sentence2": 639 + }, + "pol-rus": { + "num_samples": 5014, + "number_of_characters": 3850186, + "unique_pairs": 5014, + "min_sentence1_length": 60, + "average_sentence1_length": 380.9339848424412, + "max_sentence1_length": 5103, + "unique_sentence1": 5014, + "min_sentence2_length": 59, + "average_sentence2_length": 386.95313123254886, + "max_sentence2_length": 4888, + "unique_sentence2": 5014 + }, + "pol-slk": { + "num_samples": 1918, + "number_of_characters": 1321855, + "unique_pairs": 1918, + "min_sentence1_length": 71, + "average_sentence1_length": 354.28362877997915, + "max_sentence1_length": 5103, + "unique_sentence1": 1918, + "min_sentence2_length": 67, + "average_sentence2_length": 334.900417101147, + "max_sentence2_length": 4641, + "unique_sentence2": 1918 + }, + "pol-slv": { + "num_samples": 1382, + "number_of_characters": 859222, + "unique_pairs": 1382, + "min_sentence1_length": 76, + "average_sentence1_length": 317.4905933429812, + "max_sentence1_length": 2101, + "unique_sentence1": 1382, + "min_sentence2_length": 75, + "average_sentence2_length": 304.232995658466, + "max_sentence2_length": 2015, + "unique_sentence2": 1382 + }, + "pol-srp": { + "num_samples": 492, + "number_of_characters": 350413, + "unique_pairs": 492, + "min_sentence1_length": 82, + "average_sentence1_length": 357.2642276422764, + "max_sentence1_length": 1902, + "unique_sentence1": 492, + "min_sentence2_length": 78, + "average_sentence2_length": 354.9573170731707, + "max_sentence2_length": 1845, + "unique_sentence2": 492 + }, + "pol-ukr": { + "num_samples": 2370, + "number_of_characters": 1753652, + "unique_pairs": 2370, + "min_sentence1_length": 59, + "average_sentence1_length": 373.8987341772152, + "max_sentence1_length": 3106, + "unique_sentence1": 2370, + "min_sentence2_length": 61, + "average_sentence2_length": 366.03881856540085, + "max_sentence2_length": 2827, + "unique_sentence2": 2370 + }, + "rus-slk": { + "num_samples": 1263, + "number_of_characters": 905526, + "unique_pairs": 1263, + "min_sentence1_length": 69, + "average_sentence1_length": 371.3887569279493, + "max_sentence1_length": 4888, + "unique_sentence1": 1263, + "min_sentence2_length": 67, + "average_sentence2_length": 345.57561361836895, + "max_sentence2_length": 4641, + "unique_sentence2": 1263 + }, + "rus-slv": { + "num_samples": 1096, + "number_of_characters": 719013, + "unique_pairs": 1096, + "min_sentence1_length": 84, + "average_sentence1_length": 341.51368613138686, + "max_sentence1_length": 2164, + "unique_sentence1": 1096, + "min_sentence2_length": 71, + "average_sentence2_length": 314.5200729927007, + "max_sentence2_length": 2015, + "unique_sentence2": 1096 + }, + "rus-srp": { + "num_samples": 455, + "number_of_characters": 341619, + "unique_pairs": 455, + "min_sentence1_length": 92, + "average_sentence1_length": 386.6307692307692, + "max_sentence1_length": 1921, + "unique_sentence1": 455, + "min_sentence2_length": 90, + "average_sentence2_length": 364.1802197802198, + "max_sentence2_length": 1845, + "unique_sentence2": 455 + }, + "rus-ukr": { + "num_samples": 15251, + "number_of_characters": 10782282, + "unique_pairs": 15251, + "min_sentence1_length": 55, + "average_sentence1_length": 358.27040849780343, + "max_sentence1_length": 3905, + "unique_sentence1": 15251, + "min_sentence2_length": 49, + "average_sentence2_length": 348.71811684479707, + "max_sentence2_length": 3801, + "unique_sentence2": 15251 + }, + "slk-slv": { + "num_samples": 1259, + "number_of_characters": 852109, + "unique_pairs": 1259, + "min_sentence1_length": 68, + "average_sentence1_length": 338.07783955520256, + "max_sentence1_length": 1961, + "unique_sentence1": 1259, + "min_sentence2_length": 71, + "average_sentence2_length": 338.736298649722, + "max_sentence2_length": 1872, + "unique_sentence2": 1259 + }, + "slk-srp": { + "num_samples": 561, + "number_of_characters": 493396, + "unique_pairs": 561, + "min_sentence1_length": 82, + "average_sentence1_length": 431.1336898395722, + "max_sentence1_length": 1910, + "unique_sentence1": 561, + "min_sentence2_length": 80, + "average_sentence2_length": 448.3600713012478, + "max_sentence2_length": 1845, + "unique_sentence2": 561 + }, + "slk-ukr": { + "num_samples": 944, + "number_of_characters": 608143, + "unique_pairs": 944, + "min_sentence1_length": 68, + "average_sentence1_length": 314.40783898305085, + "max_sentence1_length": 1910, + "unique_sentence1": 944, + "min_sentence2_length": 69, + "average_sentence2_length": 329.8114406779661, + "max_sentence2_length": 1923, + "unique_sentence2": 944 + }, + "slv-srp": { + "num_samples": 499, + "number_of_characters": 378293, + "unique_pairs": 499, + "min_sentence1_length": 80, + "average_sentence1_length": 374.4729458917836, + "max_sentence1_length": 2476, + "unique_sentence1": 499, + "min_sentence2_length": 80, + "average_sentence2_length": 383.6292585170341, + "max_sentence2_length": 2387, + "unique_sentence2": 499 + }, + "slv-ukr": { + "num_samples": 733, + "number_of_characters": 431361, + "unique_pairs": 733, + "min_sentence1_length": 71, + "average_sentence1_length": 286.63847203274213, + "max_sentence1_length": 1872, + "unique_sentence1": 733, + "min_sentence2_length": 69, + "average_sentence2_length": 301.8485675306958, + "max_sentence2_length": 1923, + "unique_sentence2": 733 + }, + "cat-deu": { + "num_samples": 302, + "number_of_characters": 279459, + "unique_pairs": 302, + "min_sentence1_length": 66, + "average_sentence1_length": 451.95364238410593, + "max_sentence1_length": 2056, + "unique_sentence1": 302, + "min_sentence2_length": 77, + "average_sentence2_length": 473.4072847682119, + "max_sentence2_length": 2082, + "unique_sentence2": 302 + }, + "cat-fra": { + "num_samples": 598, + "number_of_characters": 476709, + "unique_pairs": 598, + "min_sentence1_length": 62, + "average_sentence1_length": 381.67558528428094, + "max_sentence1_length": 2056, + "unique_sentence1": 598, + "min_sentence2_length": 74, + "average_sentence2_length": 415.4966555183947, + "max_sentence2_length": 2277, + "unique_sentence2": 598 + }, + "cat-ita": { + "num_samples": 418, + "number_of_characters": 327132, + "unique_pairs": 418, + "min_sentence1_length": 60, + "average_sentence1_length": 388.79904306220095, + "max_sentence1_length": 2056, + "unique_sentence1": 418, + "min_sentence2_length": 55, + "average_sentence2_length": 393.8133971291866, + "max_sentence2_length": 2186, + "unique_sentence2": 418 + }, + "cat-por": { + "num_samples": 370, + "number_of_characters": 248938, + "unique_pairs": 370, + "min_sentence1_length": 58, + "average_sentence1_length": 338.3243243243243, + "max_sentence1_length": 2056, + "unique_sentence1": 370, + "min_sentence2_length": 60, + "average_sentence2_length": 334.4810810810811, + "max_sentence2_length": 2088, + "unique_sentence2": 370 + }, + "cat-spa": { + "num_samples": 2648, + "number_of_characters": 2308040, + "unique_pairs": 2648, + "min_sentence1_length": 62, + "average_sentence1_length": 430.9592145015106, + "max_sentence1_length": 8113, + "unique_sentence1": 2648, + "min_sentence2_length": 65, + "average_sentence2_length": 440.6570996978852, + "max_sentence2_length": 8345, + "unique_sentence2": 2648 + }, + "dan-deu": { + "num_samples": 4337, + "number_of_characters": 3443148, + "unique_pairs": 4337, + "min_sentence1_length": 60, + "average_sentence1_length": 372.69564214895087, + "max_sentence1_length": 4236, + "unique_sentence1": 4337, + "min_sentence2_length": 69, + "average_sentence2_length": 421.20521097532855, + "max_sentence2_length": 4093, + "unique_sentence2": 4337 + }, + "dan-fra": { + "num_samples": 3802, + "number_of_characters": 3021023, + "unique_pairs": 3802, + "min_sentence1_length": 62, + "average_sentence1_length": 365.19542346133613, + "max_sentence1_length": 4236, + "unique_sentence1": 3802, + "min_sentence2_length": 70, + "average_sentence2_length": 429.3924250394529, + "max_sentence2_length": 4717, + "unique_sentence2": 3802 + }, + "dan-isl": { + "num_samples": 327, + "number_of_characters": 230870, + "unique_pairs": 327, + "min_sentence1_length": 81, + "average_sentence1_length": 357.5107033639144, + "max_sentence1_length": 1856, + "unique_sentence1": 327, + "min_sentence2_length": 82, + "average_sentence2_length": 348.51376146788994, + "max_sentence2_length": 1897, + "unique_sentence2": 327 + }, + "dan-ita": { + "num_samples": 3818, + "number_of_characters": 2976506, + "unique_pairs": 3818, + "min_sentence1_length": 62, + "average_sentence1_length": 371.5421686746988, + "max_sentence1_length": 4236, + "unique_sentence1": 3818, + "min_sentence2_length": 64, + "average_sentence2_length": 408.05605028810896, + "max_sentence2_length": 4574, + "unique_sentence2": 3818 + }, + "dan-nld": { + "num_samples": 4099, + "number_of_characters": 3047816, + "unique_pairs": 4099, + "min_sentence1_length": 68, + "average_sentence1_length": 360.1166138082459, + "max_sentence1_length": 4236, + "unique_sentence1": 4099, + "min_sentence2_length": 63, + "average_sentence2_length": 383.4344962185899, + "max_sentence2_length": 4431, + "unique_sentence2": 4099 + }, + "dan-nor": { + "num_samples": 2603, + "number_of_characters": 1873194, + "unique_pairs": 2603, + "min_sentence1_length": 62, + "average_sentence1_length": 365.46177487514404, + "max_sentence1_length": 3505, + "unique_sentence1": 2603, + "min_sentence2_length": 59, + "average_sentence2_length": 354.1671148674606, + "max_sentence2_length": 3400, + "unique_sentence2": 2603 + }, + "dan-por": { + "num_samples": 3206, + "number_of_characters": 2344257, + "unique_pairs": 3206, + "min_sentence1_length": 60, + "average_sentence1_length": 353.29850280723645, + "max_sentence1_length": 3843, + "unique_sentence1": 3206, + "min_sentence2_length": 65, + "average_sentence2_length": 377.91079226450404, + "max_sentence2_length": 3799, + "unique_sentence2": 3206 + }, + "dan-ron": { + "num_samples": 2052, + "number_of_characters": 1446475, + "unique_pairs": 2052, + "min_sentence1_length": 67, + "average_sentence1_length": 336.3269980506823, + "max_sentence1_length": 4236, + "unique_sentence1": 2052, + "min_sentence2_length": 63, + "average_sentence2_length": 368.5828460038986, + "max_sentence2_length": 4285, + "unique_sentence2": 2052 + }, + "dan-spa": { + "num_samples": 3571, + "number_of_characters": 2720999, + "unique_pairs": 3571, + "min_sentence1_length": 60, + "average_sentence1_length": 360.46233548025765, + "max_sentence1_length": 4236, + "unique_sentence1": 3571, + "min_sentence2_length": 65, + "average_sentence2_length": 401.508821058527, + "max_sentence2_length": 4498, + "unique_sentence2": 3571 + }, + "dan-swe": { + "num_samples": 4268, + "number_of_characters": 3115686, + "unique_pairs": 4268, + "min_sentence1_length": 63, + "average_sentence1_length": 368.2061855670103, + "max_sentence1_length": 15317, + "unique_sentence1": 4268, + "min_sentence2_length": 64, + "average_sentence2_length": 361.8045923149016, + "max_sentence2_length": 15427, + "unique_sentence2": 4268 + }, + "deu-fra": { + "num_samples": 27727, + "number_of_characters": 24341503, + "unique_pairs": 27727, + "min_sentence1_length": 52, + "average_sentence1_length": 429.8509395174379, + "max_sentence1_length": 10595, + "unique_sentence1": 27727, + "min_sentence2_length": 60, + "average_sentence2_length": 448.04796768492804, + "max_sentence2_length": 11165, + "unique_sentence2": 27727 + }, + "deu-isl": { + "num_samples": 294, + "number_of_characters": 221411, + "unique_pairs": 294, + "min_sentence1_length": 84, + "average_sentence1_length": 403.21768707482994, + "max_sentence1_length": 2082, + "unique_sentence1": 294, + "min_sentence2_length": 79, + "average_sentence2_length": 349.8809523809524, + "max_sentence2_length": 1897, + "unique_sentence2": 294 + }, + "deu-ita": { + "num_samples": 18787, + "number_of_characters": 15815694, + "unique_pairs": 18787, + "min_sentence1_length": 52, + "average_sentence1_length": 424.82344174162984, + "max_sentence1_length": 7136, + "unique_sentence1": 18787, + "min_sentence2_length": 56, + "average_sentence2_length": 417.0190025017299, + "max_sentence2_length": 6634, + "unique_sentence2": 18787 + }, + "deu-nld": { + "num_samples": 14211, + "number_of_characters": 11610783, + "unique_pairs": 14211, + "min_sentence1_length": 52, + "average_sentence1_length": 418.901695869397, + "max_sentence1_length": 4919, + "unique_sentence1": 14211, + "min_sentence2_length": 62, + "average_sentence2_length": 398.1261698684118, + "max_sentence2_length": 4779, + "unique_sentence2": 14211 + }, + "deu-nor": { + "num_samples": 2783, + "number_of_characters": 2150067, + "unique_pairs": 2783, + "min_sentence1_length": 63, + "average_sentence1_length": 413.81854114265184, + "max_sentence1_length": 18410, + "unique_sentence1": 2783, + "min_sentence2_length": 63, + "average_sentence2_length": 358.7531440891125, + "max_sentence2_length": 16149, + "unique_sentence2": 2783 + }, + "deu-por": { + "num_samples": 11319, + "number_of_characters": 9085897, + "unique_pairs": 11319, + "min_sentence1_length": 63, + "average_sentence1_length": 412.0379892216627, + "max_sentence1_length": 7136, + "unique_sentence1": 11319, + "min_sentence2_length": 59, + "average_sentence2_length": 390.6739994699178, + "max_sentence2_length": 6536, + "unique_sentence2": 11319 + }, + "deu-ron": { + "num_samples": 3598, + "number_of_characters": 2755112, + "unique_pairs": 3598, + "min_sentence1_length": 61, + "average_sentence1_length": 387.5605892162312, + "max_sentence1_length": 4093, + "unique_sentence1": 3598, + "min_sentence2_length": 55, + "average_sentence2_length": 378.17370761534187, + "max_sentence2_length": 4285, + "unique_sentence2": 3598 + }, + "deu-spa": { + "num_samples": 19739, + "number_of_characters": 16855942, + "unique_pairs": 19739, + "min_sentence1_length": 60, + "average_sentence1_length": 430.8435077764831, + "max_sentence1_length": 7136, + "unique_sentence1": 19739, + "min_sentence2_length": 55, + "average_sentence2_length": 423.09752267085463, + "max_sentence2_length": 6963, + "unique_sentence2": 19739 + }, + "deu-swe": { + "num_samples": 5772, + "number_of_characters": 4469906, + "unique_pairs": 5772, + "min_sentence1_length": 59, + "average_sentence1_length": 412.4137214137214, + "max_sentence1_length": 4093, + "unique_sentence1": 5772, + "min_sentence2_length": 57, + "average_sentence2_length": 361.9982674982675, + "max_sentence2_length": 4038, + "unique_sentence2": 5772 + }, + "fra-isl": { + "num_samples": 347, + "number_of_characters": 256923, + "unique_pairs": 347, + "min_sentence1_length": 76, + "average_sentence1_length": 400.63976945244957, + "max_sentence1_length": 2277, + "unique_sentence1": 347, + "min_sentence2_length": 75, + "average_sentence2_length": 339.7723342939481, + "max_sentence2_length": 1897, + "unique_sentence2": 347 + }, + "fra-ita": { + "num_samples": 20002, + "number_of_characters": 17269559, + "unique_pairs": 20002, + "min_sentence1_length": 62, + "average_sentence1_length": 444.9357064293571, + "max_sentence1_length": 7253, + "unique_sentence1": 20002, + "min_sentence2_length": 49, + "average_sentence2_length": 418.455904409559, + "max_sentence2_length": 6634, + "unique_sentence2": 20002 + }, + "fra-nld": { + "num_samples": 14684, + "number_of_characters": 12784405, + "unique_pairs": 14684, + "min_sentence1_length": 60, + "average_sentence1_length": 455.3099291746118, + "max_sentence1_length": 9107, + "unique_sentence1": 14684, + "min_sentence2_length": 51, + "average_sentence2_length": 415.32511577226916, + "max_sentence2_length": 8534, + "unique_sentence2": 14684 + }, + "fra-nor": { + "num_samples": 2558, + "number_of_characters": 1963253, + "unique_pairs": 2558, + "min_sentence1_length": 86, + "average_sentence1_length": 421.1630179827991, + "max_sentence1_length": 4086, + "unique_sentence1": 2558, + "min_sentence2_length": 64, + "average_sentence2_length": 346.3322908522283, + "max_sentence2_length": 3400, + "unique_sentence2": 2558 + }, + "fra-por": { + "num_samples": 13265, + "number_of_characters": 10942625, + "unique_pairs": 13265, + "min_sentence1_length": 61, + "average_sentence1_length": 431.68051262721445, + "max_sentence1_length": 7253, + "unique_sentence1": 13265, + "min_sentence2_length": 55, + "average_sentence2_length": 393.2441010177158, + "max_sentence2_length": 6536, + "unique_sentence2": 13265 + }, + "fra-ron": { + "num_samples": 3295, + "number_of_characters": 2448321, + "unique_pairs": 3295, + "min_sentence1_length": 70, + "average_sentence1_length": 385.2907435508346, + "max_sentence1_length": 4717, + "unique_sentence1": 3295, + "min_sentence2_length": 63, + "average_sentence2_length": 357.750531107739, + "max_sentence2_length": 4285, + "unique_sentence2": 3295 + }, + "fra-spa": { + "num_samples": 23311, + "number_of_characters": 20477763, + "unique_pairs": 23311, + "min_sentence1_length": 54, + "average_sentence1_length": 451.83681523744156, + "max_sentence1_length": 14943, + "unique_sentence1": 23311, + "min_sentence2_length": 50, + "average_sentence2_length": 426.62241002102013, + "max_sentence2_length": 13323, + "unique_sentence2": 23311 + }, + "fra-swe": { + "num_samples": 5006, + "number_of_characters": 3880565, + "unique_pairs": 5006, + "min_sentence1_length": 70, + "average_sentence1_length": 421.9480623252098, + "max_sentence1_length": 4717, + "unique_sentence1": 5006, + "min_sentence2_length": 59, + "average_sentence2_length": 353.2347183379944, + "max_sentence2_length": 4038, + "unique_sentence2": 5006 + }, + "isl-ita": { + "num_samples": 421, + "number_of_characters": 310158, + "unique_pairs": 421, + "min_sentence1_length": 75, + "average_sentence1_length": 350.3254156769596, + "max_sentence1_length": 1897, + "unique_sentence1": 421, + "min_sentence2_length": 74, + "average_sentence2_length": 386.3919239904988, + "max_sentence2_length": 2186, + "unique_sentence2": 421 + }, + "isl-nld": { + "num_samples": 311, + "number_of_characters": 228026, + "unique_pairs": 311, + "min_sentence1_length": 123, + "average_sentence1_length": 350.5594855305466, + "max_sentence1_length": 1705, + "unique_sentence1": 311, + "min_sentence2_length": 125, + "average_sentence2_length": 382.64308681672026, + "max_sentence2_length": 2142, + "unique_sentence2": 311 + }, + "isl-por": { + "num_samples": 341, + "number_of_characters": 260057, + "unique_pairs": 341, + "min_sentence1_length": 110, + "average_sentence1_length": 367.62463343108504, + "max_sentence1_length": 1897, + "unique_sentence1": 341, + "min_sentence2_length": 128, + "average_sentence2_length": 395.0058651026393, + "max_sentence2_length": 2088, + "unique_sentence2": 341 + }, + "isl-spa": { + "num_samples": 366, + "number_of_characters": 262085, + "unique_pairs": 366, + "min_sentence1_length": 75, + "average_sentence1_length": 336.9535519125683, + "max_sentence1_length": 1517, + "unique_sentence1": 366, + "min_sentence2_length": 73, + "average_sentence2_length": 379.1256830601093, + "max_sentence2_length": 1601, + "unique_sentence2": 366 + }, + "isl-swe": { + "num_samples": 312, + "number_of_characters": 214179, + "unique_pairs": 312, + "min_sentence1_length": 75, + "average_sentence1_length": 342.18910256410254, + "max_sentence1_length": 1897, + "unique_sentence1": 312, + "min_sentence2_length": 78, + "average_sentence2_length": 344.28205128205127, + "max_sentence2_length": 1841, + "unique_sentence2": 312 + }, + "ita-nld": { + "num_samples": 9160, + "number_of_characters": 7462623, + "unique_pairs": 9160, + "min_sentence1_length": 64, + "average_sentence1_length": 413.3672489082969, + "max_sentence1_length": 16311, + "unique_sentence1": 9160, + "min_sentence2_length": 62, + "average_sentence2_length": 401.32958515283843, + "max_sentence2_length": 15855, + "unique_sentence2": 9160 + }, + "ita-nor": { + "num_samples": 2516, + "number_of_characters": 1877016, + "unique_pairs": 2516, + "min_sentence1_length": 88, + "average_sentence1_length": 395.466613672496, + "max_sentence1_length": 3906, + "unique_sentence1": 2516, + "min_sentence2_length": 80, + "average_sentence2_length": 350.5651828298887, + "max_sentence2_length": 3030, + "unique_sentence2": 2516 + }, + "ita-por": { + "num_samples": 10924, + "number_of_characters": 8751330, + "unique_pairs": 10924, + "min_sentence1_length": 55, + "average_sentence1_length": 406.04696082021235, + "max_sentence1_length": 16311, + "unique_sentence1": 10924, + "min_sentence2_length": 55, + "average_sentence2_length": 395.06343830098865, + "max_sentence2_length": 16230, + "unique_sentence2": 10924 + }, + "ita-ron": { + "num_samples": 3360, + "number_of_characters": 2417807, + "unique_pairs": 3360, + "min_sentence1_length": 63, + "average_sentence1_length": 360.0520833333333, + "max_sentence1_length": 6226, + "unique_sentence1": 3360, + "min_sentence2_length": 65, + "average_sentence2_length": 359.53333333333336, + "max_sentence2_length": 6571, + "unique_sentence2": 3360 + }, + "ita-spa": { + "num_samples": 16534, + "number_of_characters": 14167503, + "unique_pairs": 16534, + "min_sentence1_length": 49, + "average_sentence1_length": 426.6369904439337, + "max_sentence1_length": 16311, + "unique_sentence1": 16534, + "min_sentence2_length": 50, + "average_sentence2_length": 430.2338816983186, + "max_sentence2_length": 16655, + "unique_sentence2": 16534 + }, + "ita-swe": { + "num_samples": 4741, + "number_of_characters": 3557838, + "unique_pairs": 4741, + "min_sentence1_length": 64, + "average_sentence1_length": 394.9702594389369, + "max_sentence1_length": 16311, + "unique_sentence1": 4741, + "min_sentence2_length": 57, + "average_sentence2_length": 355.4701539759544, + "max_sentence2_length": 15020, + "unique_sentence2": 4741 + }, + "nld-nor": { + "num_samples": 2664, + "number_of_characters": 1941142, + "unique_pairs": 2664, + "min_sentence1_length": 72, + "average_sentence1_length": 380.6306306306306, + "max_sentence1_length": 3967, + "unique_sentence1": 2664, + "min_sentence2_length": 68, + "average_sentence2_length": 348.0262762762763, + "max_sentence2_length": 3400, + "unique_sentence2": 2664 + }, + "nld-por": { + "num_samples": 7021, + "number_of_characters": 5347190, + "unique_pairs": 7021, + "min_sentence1_length": 51, + "average_sentence1_length": 380.21991169349093, + "max_sentence1_length": 15855, + "unique_sentence1": 7021, + "min_sentence2_length": 56, + "average_sentence2_length": 381.37957555903716, + "max_sentence2_length": 16230, + "unique_sentence2": 7021 + }, + "nld-ron": { + "num_samples": 2888, + "number_of_characters": 2001437, + "unique_pairs": 2888, + "min_sentence1_length": 60, + "average_sentence1_length": 340.68905817174516, + "max_sentence1_length": 4431, + "unique_sentence1": 2888, + "min_sentence2_length": 70, + "average_sentence2_length": 352.32929362880884, + "max_sentence2_length": 4285, + "unique_sentence2": 2888 + }, + "nld-spa": { + "num_samples": 9555, + "number_of_characters": 7861048, + "unique_pairs": 9555, + "min_sentence1_length": 62, + "average_sentence1_length": 403.9521716378859, + "max_sentence1_length": 15855, + "unique_sentence1": 9555, + "min_sentence2_length": 55, + "average_sentence2_length": 418.7634746206175, + "max_sentence2_length": 16655, + "unique_sentence2": 9555 + }, + "nld-swe": { + "num_samples": 5072, + "number_of_characters": 3727392, + "unique_pairs": 5072, + "min_sentence1_length": 65, + "average_sentence1_length": 381.6940063091483, + "max_sentence1_length": 15855, + "unique_sentence1": 5072, + "min_sentence2_length": 56, + "average_sentence2_length": 353.2018927444795, + "max_sentence2_length": 15020, + "unique_sentence2": 5072 + }, + "nor-por": { + "num_samples": 2096, + "number_of_characters": 1461412, + "unique_pairs": 2096, + "min_sentence1_length": 68, + "average_sentence1_length": 331.4675572519084, + "max_sentence1_length": 3400, + "unique_sentence1": 2096, + "min_sentence2_length": 61, + "average_sentence2_length": 365.7709923664122, + "max_sentence2_length": 3784, + "unique_sentence2": 2096 + }, + "nor-ron": { + "num_samples": 1412, + "number_of_characters": 972154, + "unique_pairs": 1412, + "min_sentence1_length": 78, + "average_sentence1_length": 324.0134560906516, + "max_sentence1_length": 1884, + "unique_sentence1": 1412, + "min_sentence2_length": 75, + "average_sentence2_length": 364.4808781869688, + "max_sentence2_length": 2196, + "unique_sentence2": 1412 + }, + "nor-spa": { + "num_samples": 2603, + "number_of_characters": 1933198, + "unique_pairs": 2603, + "min_sentence1_length": 63, + "average_sentence1_length": 347.5593545908567, + "max_sentence1_length": 3030, + "unique_sentence1": 2603, + "min_sentence2_length": 74, + "average_sentence2_length": 395.12139838647715, + "max_sentence2_length": 3847, + "unique_sentence2": 2603 + }, + "nor-swe": { + "num_samples": 3165, + "number_of_characters": 2305135, + "unique_pairs": 3165, + "min_sentence1_length": 66, + "average_sentence1_length": 360.9911532385466, + "max_sentence1_length": 2366, + "unique_sentence1": 3165, + "min_sentence2_length": 70, + "average_sentence2_length": 367.329541864139, + "max_sentence2_length": 2340, + "unique_sentence2": 3165 + }, + "por-ron": { + "num_samples": 3026, + "number_of_characters": 2086079, + "unique_pairs": 3026, + "min_sentence1_length": 71, + "average_sentence1_length": 340.8793787177792, + "max_sentence1_length": 4439, + "unique_sentence1": 3026, + "min_sentence2_length": 63, + "average_sentence2_length": 348.5056179775281, + "max_sentence2_length": 3274, + "unique_sentence2": 3026 + }, + "por-spa": { + "num_samples": 16084, + "number_of_characters": 12835938, + "unique_pairs": 16084, + "min_sentence1_length": 51, + "average_sentence1_length": 391.85967421039544, + "max_sentence1_length": 16230, + "unique_sentence1": 16084, + "min_sentence2_length": 54, + "average_sentence2_length": 406.19665506093014, + "max_sentence2_length": 16655, + "unique_sentence2": 16084 + }, + "por-swe": { + "num_samples": 4235, + "number_of_characters": 2994503, + "unique_pairs": 4235, + "min_sentence1_length": 62, + "average_sentence1_length": 367.6595041322314, + "max_sentence1_length": 16230, + "unique_sentence1": 4235, + "min_sentence2_length": 57, + "average_sentence2_length": 339.4250295159386, + "max_sentence2_length": 15020, + "unique_sentence2": 4235 + }, + "ron-spa": { + "num_samples": 3375, + "number_of_characters": 2415347, + "unique_pairs": 3375, + "min_sentence1_length": 73, + "average_sentence1_length": 355.3931851851852, + "max_sentence1_length": 4285, + "unique_sentence1": 3375, + "min_sentence2_length": 70, + "average_sentence2_length": 360.2651851851852, + "max_sentence2_length": 4498, + "unique_sentence2": 3375 + }, + "ron-swe": { + "num_samples": 2154, + "number_of_characters": 1454257, + "unique_pairs": 2154, + "min_sentence1_length": 63, + "average_sentence1_length": 354.6615598885794, + "max_sentence1_length": 4285, + "unique_sentence1": 2154, + "min_sentence2_length": 63, + "average_sentence2_length": 320.4809656453111, + "max_sentence2_length": 4038, + "unique_sentence2": 2154 + }, + "spa-swe": { + "num_samples": 4884, + "number_of_characters": 3751782, + "unique_pairs": 4884, + "min_sentence1_length": 66, + "average_sentence1_length": 406.5151515151515, + "max_sentence1_length": 16655, + "unique_sentence1": 4884, + "min_sentence2_length": 62, + "average_sentence2_length": 361.66298116298117, + "max_sentence2_length": 15020, + "unique_sentence2": 4884 + }, + "ben-hin": { + "num_samples": 1174, + "number_of_characters": 682915, + "unique_pairs": 1174, + "min_sentence1_length": 64, + "average_sentence1_length": 287.33049403747873, + "max_sentence1_length": 1957, + "unique_sentence1": 1174, + "min_sentence2_length": 50, + "average_sentence2_length": 294.36882453151617, + "max_sentence2_length": 1980, + "unique_sentence2": 1174 + }, + "ben-mar": { + "num_samples": 566, + "number_of_characters": 305353, + "unique_pairs": 566, + "min_sentence1_length": 50, + "average_sentence1_length": 271.8321554770318, + "max_sentence1_length": 1753, + "unique_sentence1": 566, + "min_sentence2_length": 57, + "average_sentence2_length": 267.660777385159, + "max_sentence2_length": 1780, + "unique_sentence2": 566 + }, + "ben-urd": { + "num_samples": 488, + "number_of_characters": 265698, + "unique_pairs": 488, + "min_sentence1_length": 61, + "average_sentence1_length": 269.23155737704917, + "max_sentence1_length": 1190, + "unique_sentence1": 488, + "min_sentence2_length": 62, + "average_sentence2_length": 275.23155737704917, + "max_sentence2_length": 1179, + "unique_sentence2": 488 + }, + "hin-mar": { + "num_samples": 615, + "number_of_characters": 320880, + "unique_pairs": 615, + "min_sentence1_length": 58, + "average_sentence1_length": 265.8861788617886, + "max_sentence1_length": 1769, + "unique_sentence1": 615, + "min_sentence2_length": 58, + "average_sentence2_length": 255.869918699187, + "max_sentence2_length": 1780, + "unique_sentence2": 615 + }, + "hin-urd": { + "num_samples": 545, + "number_of_characters": 293939, + "unique_pairs": 545, + "min_sentence1_length": 63, + "average_sentence1_length": 271.8880733944954, + "max_sentence1_length": 1206, + "unique_sentence1": 545, + "min_sentence2_length": 62, + "average_sentence2_length": 267.44954128440367, + "max_sentence2_length": 1179, + "unique_sentence2": 545 + }, + "mar-urd": { + "num_samples": 270, + "number_of_characters": 147706, + "unique_pairs": 270, + "min_sentence1_length": 63, + "average_sentence1_length": 270.89629629629627, + "max_sentence1_length": 1169, + "unique_sentence1": 270, + "min_sentence2_length": 66, + "average_sentence2_length": 276.162962962963, + "max_sentence2_length": 1172, + "unique_sentence2": 270 + }, + "aze-kaz": { + "num_samples": 412, + "number_of_characters": 230950, + "unique_pairs": 412, + "min_sentence1_length": 73, + "average_sentence1_length": 280.497572815534, + "max_sentence1_length": 1824, + "unique_sentence1": 412, + "min_sentence2_length": 68, + "average_sentence2_length": 280.06067961165047, + "max_sentence2_length": 1855, + "unique_sentence2": 412 + }, + "aze-tur": { + "num_samples": 388, + "number_of_characters": 205998, + "unique_pairs": 388, + "min_sentence1_length": 72, + "average_sentence1_length": 266.66752577319585, + "max_sentence1_length": 1824, + "unique_sentence1": 388, + "min_sentence2_length": 64, + "average_sentence2_length": 264.25515463917526, + "max_sentence2_length": 1838, + "unique_sentence2": 388 + }, + "kaz-tur": { + "num_samples": 340, + "number_of_characters": 181572, + "unique_pairs": 340, + "min_sentence1_length": 68, + "average_sentence1_length": 267.44117647058823, + "max_sentence1_length": 1855, + "unique_sentence1": 340, + "min_sentence2_length": 64, + "average_sentence2_length": 266.59411764705885, + "max_sentence2_length": 1838, + "unique_sentence2": 340 + }, + "est-fin": { + "num_samples": 790, + "number_of_characters": 551725, + "unique_pairs": 790, + "min_sentence1_length": 63, + "average_sentence1_length": 341.0708860759494, + "max_sentence1_length": 1829, + "unique_sentence1": 790, + "min_sentence2_length": 63, + "average_sentence2_length": 357.3151898734177, + "max_sentence2_length": 1815, + "unique_sentence2": 790 + }, + "est-hun": { + "num_samples": 674, + "number_of_characters": 465779, + "unique_pairs": 674, + "min_sentence1_length": 72, + "average_sentence1_length": 329.88278931750745, + "max_sentence1_length": 2117, + "unique_sentence1": 674, + "min_sentence2_length": 73, + "average_sentence2_length": 361.1839762611276, + "max_sentence2_length": 2403, + "unique_sentence2": 674 + }, + "fin-hun": { + "num_samples": 1542, + "number_of_characters": 1062678, + "unique_pairs": 1542, + "min_sentence1_length": 65, + "average_sentence1_length": 336.8067444876783, + "max_sentence1_length": 2500, + "unique_sentence1": 1542, + "min_sentence2_length": 66, + "average_sentence2_length": 352.34889753566796, + "max_sentence2_length": 2674, + "unique_sentence2": 1542 + }, + "ara-eng": { + "num_samples": 5698, + "number_of_characters": 3662324, + "unique_pairs": 5698, + "min_sentence1_length": 53, + "average_sentence1_length": 302.5900315900316, + "max_sentence1_length": 2664, + "unique_sentence1": 5698, + "min_sentence2_length": 60, + "average_sentence2_length": 340.14847314847316, + "max_sentence2_length": 2811, + "unique_sentence2": 5698 + }, + "aze-eng": { + "num_samples": 603, + "number_of_characters": 383857, + "unique_pairs": 603, + "min_sentence1_length": 58, + "average_sentence1_length": 323.5240464344942, + "max_sentence1_length": 1339, + "unique_sentence1": 603, + "min_sentence2_length": 69, + "average_sentence2_length": 313.0547263681592, + "max_sentence2_length": 1352, + "unique_sentence2": 603 + }, + "ben-eng": { + "num_samples": 1367, + "number_of_characters": 866437, + "unique_pairs": 1367, + "min_sentence1_length": 50, + "average_sentence1_length": 318.6664228237015, + "max_sentence1_length": 1191, + "unique_sentence1": 1367, + "min_sentence2_length": 65, + "average_sentence2_length": 315.15727871250914, + "max_sentence2_length": 1094, + "unique_sentence2": 1367 + }, + "bul-eng": { + "num_samples": 2133, + "number_of_characters": 1458078, + "unique_pairs": 2133, + "min_sentence1_length": 65, + "average_sentence1_length": 354.9592123769339, + "max_sentence1_length": 3016, + "unique_sentence1": 2133, + "min_sentence2_length": 65, + "average_sentence2_length": 328.62165963431784, + "max_sentence2_length": 2770, + "unique_sentence2": 2133 + }, + "cat-eng": { + "num_samples": 1152, + "number_of_characters": 977495, + "unique_pairs": 1152, + "min_sentence1_length": 64, + "average_sentence1_length": 437.25434027777777, + "max_sentence1_length": 8113, + "unique_sentence1": 1152, + "min_sentence2_length": 59, + "average_sentence2_length": 411.265625, + "max_sentence2_length": 7400, + "unique_sentence2": 1152 + }, + "ces-eng": { + "num_samples": 3775, + "number_of_characters": 2651016, + "unique_pairs": 3775, + "min_sentence1_length": 54, + "average_sentence1_length": 347.59152317880796, + "max_sentence1_length": 2349, + "unique_sentence1": 3775, + "min_sentence2_length": 60, + "average_sentence2_length": 354.66437086092714, + "max_sentence2_length": 2401, + "unique_sentence2": 3775 + }, + "dan-eng": { + "num_samples": 4512, + "number_of_characters": 3404958, + "unique_pairs": 4512, + "min_sentence1_length": 49, + "average_sentence1_length": 381.45301418439715, + "max_sentence1_length": 15317, + "unique_sentence1": 4512, + "min_sentence2_length": 56, + "average_sentence2_length": 373.19193262411346, + "max_sentence2_length": 14749, + "unique_sentence2": 4512 + }, + "deu-eng": { + "num_samples": 37348, + "number_of_characters": 29811894, + "unique_pairs": 37348, + "min_sentence1_length": 53, + "average_sentence1_length": 423.63243011673984, + "max_sentence1_length": 6437, + "unique_sentence1": 37348, + "min_sentence2_length": 46, + "average_sentence2_length": 374.58685873406876, + "max_sentence2_length": 5781, + "unique_sentence2": 37348 + }, + "ell-eng": { + "num_samples": 2790, + "number_of_characters": 2021300, + "unique_pairs": 2790, + "min_sentence1_length": 66, + "average_sentence1_length": 394.6537634408602, + "max_sentence1_length": 2963, + "unique_sentence1": 2790, + "min_sentence2_length": 61, + "average_sentence2_length": 329.82652329749106, + "max_sentence2_length": 3013, + "unique_sentence2": 2790 + }, + "eng-est": { + "num_samples": 755, + "number_of_characters": 528417, + "unique_pairs": 755, + "min_sentence1_length": 58, + "average_sentence1_length": 352.57880794701987, + "max_sentence1_length": 1567, + "unique_sentence1": 755, + "min_sentence2_length": 62, + "average_sentence2_length": 347.3112582781457, + "max_sentence2_length": 1630, + "unique_sentence2": 755 + }, + "eng-fas": { + "num_samples": 556, + "number_of_characters": 431685, + "unique_pairs": 556, + "min_sentence1_length": 59, + "average_sentence1_length": 396.6492805755396, + "max_sentence1_length": 5339, + "unique_sentence1": 556, + "min_sentence2_length": 68, + "average_sentence2_length": 379.76258992805754, + "max_sentence2_length": 4782, + "unique_sentence2": 556 + }, + "eng-fin": { + "num_samples": 3443, + "number_of_characters": 2505517, + "unique_pairs": 3443, + "min_sentence1_length": 62, + "average_sentence1_length": 359.75776938716234, + "max_sentence1_length": 4412, + "unique_sentence1": 3443, + "min_sentence2_length": 57, + "average_sentence2_length": 367.9555620098751, + "max_sentence2_length": 4583, + "unique_sentence2": 3443 + }, + "eng-fra": { + "num_samples": 37208, + "number_of_characters": 30609932, + "unique_pairs": 37208, + "min_sentence1_length": 52, + "average_sentence1_length": 375.6779724790368, + "max_sentence1_length": 14463, + "unique_sentence1": 37208, + "min_sentence2_length": 59, + "average_sentence2_length": 446.99274349602234, + "max_sentence2_length": 15312, + "unique_sentence2": 37208 + }, + "eng-heb": { + "num_samples": 882, + "number_of_characters": 541517, + "unique_pairs": 882, + "min_sentence1_length": 80, + "average_sentence1_length": 339.5770975056689, + "max_sentence1_length": 16651, + "unique_sentence1": 882, + "min_sentence2_length": 62, + "average_sentence2_length": 274.38775510204084, + "max_sentence2_length": 14483, + "unique_sentence2": 882 + }, + "eng-hin": { + "num_samples": 2219, + "number_of_characters": 1277126, + "unique_pairs": 2219, + "min_sentence1_length": 59, + "average_sentence1_length": 284.607030193781, + "max_sentence1_length": 2439, + "unique_sentence1": 2219, + "min_sentence2_length": 50, + "average_sentence2_length": 290.93420459666515, + "max_sentence2_length": 2496, + "unique_sentence2": 2219 + }, + "eng-hrv": { + "num_samples": 336, + "number_of_characters": 247780, + "unique_pairs": 336, + "min_sentence1_length": 79, + "average_sentence1_length": 370.67559523809524, + "max_sentence1_length": 1657, + "unique_sentence1": 336, + "min_sentence2_length": 59, + "average_sentence2_length": 366.76488095238096, + "max_sentence2_length": 1393, + "unique_sentence2": 336 + }, + "eng-hun": { + "num_samples": 2185, + "number_of_characters": 1517498, + "unique_pairs": 2185, + "min_sentence1_length": 61, + "average_sentence1_length": 334.85766590389017, + "max_sentence1_length": 1664, + "unique_sentence1": 2185, + "min_sentence2_length": 55, + "average_sentence2_length": 359.64942791762013, + "max_sentence2_length": 1814, + "unique_sentence2": 2185 + }, + "eng-ind": { + "num_samples": 3454, + "number_of_characters": 2493124, + "unique_pairs": 3454, + "min_sentence1_length": 60, + "average_sentence1_length": 344.7333526346265, + "max_sentence1_length": 4253, + "unique_sentence1": 3454, + "min_sentence2_length": 67, + "average_sentence2_length": 377.07440648523453, + "max_sentence2_length": 4132, + "unique_sentence2": 3454 + }, + "eng-isl": { + "num_samples": 358, + "number_of_characters": 237004, + "unique_pairs": 358, + "min_sentence1_length": 72, + "average_sentence1_length": 329.6117318435754, + "max_sentence1_length": 1112, + "unique_sentence1": 358, + "min_sentence2_length": 69, + "average_sentence2_length": 332.4106145251397, + "max_sentence2_length": 1206, + "unique_sentence2": 358 + }, + "eng-ita": { + "num_samples": 19661, + "number_of_characters": 15893506, + "unique_pairs": 19661, + "min_sentence1_length": 54, + "average_sentence1_length": 381.6225522608209, + "max_sentence1_length": 14540, + "unique_sentence1": 19661, + "min_sentence2_length": 49, + "average_sentence2_length": 426.75474289201975, + "max_sentence2_length": 16311, + "unique_sentence2": 19661 + }, + "eng-jpn": { + "num_samples": 3807, + "number_of_characters": 1890484, + "unique_pairs": 3807, + "min_sentence1_length": 66, + "average_sentence1_length": 323.3538219070134, + "max_sentence1_length": 1738, + "unique_sentence1": 3807, + "min_sentence2_length": 47, + "average_sentence2_length": 173.22721302863147, + "max_sentence2_length": 856, + "unique_sentence2": 3807 + }, + "eng-kaz": { + "num_samples": 346, + "number_of_characters": 245791, + "unique_pairs": 346, + "min_sentence1_length": 72, + "average_sentence1_length": 349.8699421965318, + "max_sentence1_length": 2126, + "unique_sentence1": 346, + "min_sentence2_length": 79, + "average_sentence2_length": 360.5086705202312, + "max_sentence2_length": 2052, + "unique_sentence2": 346 + }, + "eng-kor": { + "num_samples": 2558, + "number_of_characters": 1358932, + "unique_pairs": 2558, + "min_sentence1_length": 58, + "average_sentence1_length": 342.4530883502737, + "max_sentence1_length": 11054, + "unique_sentence1": 2558, + "min_sentence2_length": 47, + "average_sentence2_length": 188.7947615324472, + "max_sentence2_length": 6212, + "unique_sentence2": 2558 + }, + "eng-lav": { + "num_samples": 1079, + "number_of_characters": 765748, + "unique_pairs": 1079, + "min_sentence1_length": 63, + "average_sentence1_length": 352.17979610750695, + "max_sentence1_length": 3960, + "unique_sentence1": 1079, + "min_sentence2_length": 65, + "average_sentence2_length": 357.5032437442076, + "max_sentence2_length": 3996, + "unique_sentence2": 1079 + }, + "eng-lit": { + "num_samples": 1185, + "number_of_characters": 884963, + "unique_pairs": 1185, + "min_sentence1_length": 70, + "average_sentence1_length": 369.4016877637131, + "max_sentence1_length": 3955, + "unique_sentence1": 1185, + "min_sentence2_length": 87, + "average_sentence2_length": 377.40253164556964, + "max_sentence2_length": 3841, + "unique_sentence2": 1185 + }, + "eng-mar": { + "num_samples": 280, + "number_of_characters": 153207, + "unique_pairs": 280, + "min_sentence1_length": 65, + "average_sentence1_length": 274.54285714285714, + "max_sentence1_length": 883, + "unique_sentence1": 280, + "min_sentence2_length": 57, + "average_sentence2_length": 272.625, + "max_sentence2_length": 943, + "unique_sentence2": 280 + }, + "eng-msa": { + "num_samples": 469, + "number_of_characters": 205989, + "unique_pairs": 469, + "min_sentence1_length": 58, + "average_sentence1_length": 209.6417910447761, + "max_sentence1_length": 1842, + "unique_sentence1": 469, + "min_sentence2_length": 58, + "average_sentence2_length": 229.56716417910448, + "max_sentence2_length": 2082, + "unique_sentence2": 469 + }, + "eng-nld": { + "num_samples": 15613, + "number_of_characters": 12313146, + "unique_pairs": 15613, + "min_sentence1_length": 55, + "average_sentence1_length": 378.0273490040351, + "max_sentence1_length": 15297, + "unique_sentence1": 15613, + "min_sentence2_length": 54, + "average_sentence2_length": 410.6196759110997, + "max_sentence2_length": 16485, + "unique_sentence2": 15613 + }, + "eng-nor": { + "num_samples": 2666, + "number_of_characters": 1883809, + "unique_pairs": 2666, + "min_sentence1_length": 55, + "average_sentence1_length": 353.07801950487624, + "max_sentence1_length": 2834, + "unique_sentence1": 2666, + "min_sentence2_length": 56, + "average_sentence2_length": 353.5270067516879, + "max_sentence2_length": 2795, + "unique_sentence2": 2666 + }, + "eng-pol": { + "num_samples": 6868, + "number_of_characters": 4946440, + "unique_pairs": 6868, + "min_sentence1_length": 54, + "average_sentence1_length": 349.2925160163075, + "max_sentence1_length": 4412, + "unique_sentence1": 6868, + "min_sentence2_length": 57, + "average_sentence2_length": 370.9229761211415, + "max_sentence2_length": 5103, + "unique_sentence2": 6868 + }, + "eng-por": { + "num_samples": 12406, + "number_of_characters": 9040635, + "unique_pairs": 12406, + "min_sentence1_length": 58, + "average_sentence1_length": 347.93801386425923, + "max_sentence1_length": 6643, + "unique_sentence1": 12406, + "min_sentence2_length": 56, + "average_sentence2_length": 380.79284217314205, + "max_sentence2_length": 7445, + "unique_sentence2": 12406 + }, + "eng-ron": { + "num_samples": 3039, + "number_of_characters": 2119434, + "unique_pairs": 3039, + "min_sentence1_length": 61, + "average_sentence1_length": 328.9292530437644, + "max_sentence1_length": 2085, + "unique_sentence1": 3039, + "min_sentence2_length": 70, + "average_sentence2_length": 368.4823955248437, + "max_sentence2_length": 2421, + "unique_sentence2": 3039 + }, + "eng-rus": { + "num_samples": 9360, + "number_of_characters": 6547558, + "unique_pairs": 9360, + "min_sentence1_length": 54, + "average_sentence1_length": 340.46976495726494, + "max_sentence1_length": 3382, + "unique_sentence1": 9360, + "min_sentence2_length": 56, + "average_sentence2_length": 359.0556623931624, + "max_sentence2_length": 4018, + "unique_sentence2": 9360 + }, + "eng-slk": { + "num_samples": 1823, + "number_of_characters": 1287409, + "unique_pairs": 1823, + "min_sentence1_length": 64, + "average_sentence1_length": 353.213933077345, + "max_sentence1_length": 4412, + "unique_sentence1": 1823, + "min_sentence2_length": 68, + "average_sentence2_length": 352.9895776193088, + "max_sentence2_length": 4641, + "unique_sentence2": 1823 + }, + "eng-slv": { + "num_samples": 1450, + "number_of_characters": 948874, + "unique_pairs": 1450, + "min_sentence1_length": 61, + "average_sentence1_length": 327.6303448275862, + "max_sentence1_length": 2058, + "unique_sentence1": 1450, + "min_sentence2_length": 59, + "average_sentence2_length": 326.7655172413793, + "max_sentence2_length": 2049, + "unique_sentence2": 1450 + }, + "eng-spa": { + "num_samples": 35446, + "number_of_characters": 29514190, + "unique_pairs": 35446, + "min_sentence1_length": 52, + "average_sentence1_length": 391.4480900524742, + "max_sentence1_length": 16651, + "unique_sentence1": 35446, + "min_sentence2_length": 47, + "average_sentence2_length": 441.20411329910286, + "max_sentence2_length": 21081, + "unique_sentence2": 35446 + }, + "eng-srp": { + "num_samples": 303, + "number_of_characters": 196853, + "unique_pairs": 303, + "min_sentence1_length": 85, + "average_sentence1_length": 323.19471947194717, + "max_sentence1_length": 1054, + "unique_sentence1": 303, + "min_sentence2_length": 78, + "average_sentence2_length": 326.48514851485146, + "max_sentence2_length": 1025, + "unique_sentence2": 303 + }, + "eng-swe": { + "num_samples": 6005, + "number_of_characters": 4500597, + "unique_pairs": 6005, + "min_sentence1_length": 54, + "average_sentence1_length": 373.82148209825147, + "max_sentence1_length": 14540, + "unique_sentence1": 6005, + "min_sentence2_length": 56, + "average_sentence2_length": 375.6534554537885, + "max_sentence2_length": 15020, + "unique_sentence2": 6005 + }, + "eng-tgl": { + "num_samples": 551, + "number_of_characters": 426303, + "unique_pairs": 551, + "min_sentence1_length": 61, + "average_sentence1_length": 346.0907441016334, + "max_sentence1_length": 1958, + "unique_sentence1": 551, + "min_sentence2_length": 57, + "average_sentence2_length": 427.5989110707804, + "max_sentence2_length": 2422, + "unique_sentence2": 551 + }, + "eng-tha": { + "num_samples": 814, + "number_of_characters": 426960, + "unique_pairs": 814, + "min_sentence1_length": 67, + "average_sentence1_length": 274.76044226044223, + "max_sentence1_length": 1842, + "unique_sentence1": 814, + "min_sentence2_length": 52, + "average_sentence2_length": 249.76044226044226, + "max_sentence2_length": 1669, + "unique_sentence2": 814 + }, + "eng-tur": { + "num_samples": 4606, + "number_of_characters": 3315390, + "unique_pairs": 4606, + "min_sentence1_length": 58, + "average_sentence1_length": 353.3762483716891, + "max_sentence1_length": 5595, + "unique_sentence1": 4606, + "min_sentence2_length": 58, + "average_sentence2_length": 366.42184107685625, + "max_sentence2_length": 6024, + "unique_sentence2": 4606 + }, + "eng-ukr": { + "num_samples": 3778, + "number_of_characters": 2803318, + "unique_pairs": 3778, + "min_sentence1_length": 59, + "average_sentence1_length": 367.927210164108, + "max_sentence1_length": 3627, + "unique_sentence1": 3778, + "min_sentence2_length": 49, + "average_sentence2_length": 374.08390682901006, + "max_sentence2_length": 3991, + "unique_sentence2": 3778 + }, + "eng-urd": { + "num_samples": 268, + "number_of_characters": 137458, + "unique_pairs": 268, + "min_sentence1_length": 67, + "average_sentence1_length": 253.8320895522388, + "max_sentence1_length": 736, + "unique_sentence1": 268, + "min_sentence2_length": 57, + "average_sentence2_length": 259.07089552238807, + "max_sentence2_length": 795, + "unique_sentence2": 268 + }, + "eng-vie": { + "num_samples": 1264, + "number_of_characters": 866332, + "unique_pairs": 1264, + "min_sentence1_length": 60, + "average_sentence1_length": 330.7072784810127, + "max_sentence1_length": 4253, + "unique_sentence1": 1264, + "min_sentence2_length": 59, + "average_sentence2_length": 354.68196202531647, + "max_sentence2_length": 3780, + "unique_sentence2": 1264 + }, + "eng-zho": { + "num_samples": 4959, + "number_of_characters": 2696879, + "unique_pairs": 4959, + "min_sentence1_length": 75, + "average_sentence1_length": 402.3762855414398, + "max_sentence1_length": 14540, + "unique_sentence1": 4959, + "min_sentence2_length": 41, + "average_sentence2_length": 141.45896350070578, + "max_sentence2_length": 4500, + "unique_sentence2": 4959 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/BitextMining/WebFAQBitextMiningQuestions.json b/mteb/descriptive_stats/BitextMining/WebFAQBitextMiningQuestions.json new file mode 100644 index 0000000000..ef654596cd --- /dev/null +++ b/mteb/descriptive_stats/BitextMining/WebFAQBitextMiningQuestions.json @@ -0,0 +1,2227 @@ +{ + "default": { + "num_samples": 682057, + "number_of_characters": 71984237, + "unique_pairs": 681597, + "min_sentence1_length": 6, + "average_sentence1_length": 52.72250559703954, + "max_sentence1_length": 847, + "unique_sentence1": 379086, + "min_sentence2_length": 6, + "average_sentence2_length": 52.81740822247994, + "max_sentence2_length": 773, + "unique_sentence2": 398743, + "hf_subset_descriptive_stats": { + "ara-fas": { + "num_samples": 609, + "number_of_characters": 53319, + "unique_pairs": 609, + "min_sentence1_length": 10, + "average_sentence1_length": 41.90147783251231, + "max_sentence1_length": 164, + "unique_sentence1": 609, + "min_sentence2_length": 11, + "average_sentence2_length": 45.65024630541872, + "max_sentence2_length": 177, + "unique_sentence2": 609 + }, + "ara-heb": { + "num_samples": 978, + "number_of_characters": 93188, + "unique_pairs": 978, + "min_sentence1_length": 10, + "average_sentence1_length": 48.33844580777096, + "max_sentence1_length": 158, + "unique_sentence1": 978, + "min_sentence2_length": 9, + "average_sentence2_length": 46.945807770961146, + "max_sentence2_length": 160, + "unique_sentence2": 978 + }, + "jpn-kor": { + "num_samples": 4820, + "number_of_characters": 310427, + "unique_pairs": 4820, + "min_sentence1_length": 10, + "average_sentence1_length": 31.906224066390042, + "max_sentence1_length": 146, + "unique_sentence1": 4820, + "min_sentence2_length": 7, + "average_sentence2_length": 32.49771784232365, + "max_sentence2_length": 208, + "unique_sentence2": 4820 + }, + "jpn-vie": { + "num_samples": 1356, + "number_of_characters": 110198, + "unique_pairs": 1356, + "min_sentence1_length": 10, + "average_sentence1_length": 28.54793510324484, + "max_sentence1_length": 152, + "unique_sentence1": 1356, + "min_sentence2_length": 10, + "average_sentence2_length": 52.719026548672566, + "max_sentence2_length": 344, + "unique_sentence2": 1356 + }, + "jpn-zho": { + "num_samples": 1728, + "number_of_characters": 86936, + "unique_pairs": 1728, + "min_sentence1_length": 7, + "average_sentence1_length": 29.02777777777778, + "max_sentence1_length": 135, + "unique_sentence1": 1728, + "min_sentence2_length": 8, + "average_sentence2_length": 21.28240740740741, + "max_sentence2_length": 126, + "unique_sentence2": 1728 + }, + "kor-vie": { + "num_samples": 1386, + "number_of_characters": 116716, + "unique_pairs": 1386, + "min_sentence1_length": 10, + "average_sentence1_length": 30.344155844155843, + "max_sentence1_length": 108, + "unique_sentence1": 1386, + "min_sentence2_length": 10, + "average_sentence2_length": 53.86652236652237, + "max_sentence2_length": 176, + "unique_sentence2": 1386 + }, + "kor-zho": { + "num_samples": 1087, + "number_of_characters": 56152, + "unique_pairs": 1087, + "min_sentence1_length": 10, + "average_sentence1_length": 30.238270469181234, + "max_sentence1_length": 109, + "unique_sentence1": 1087, + "min_sentence2_length": 10, + "average_sentence2_length": 21.419503219871206, + "max_sentence2_length": 95, + "unique_sentence2": 1087 + }, + "vie-zho": { + "num_samples": 646, + "number_of_characters": 46981, + "unique_pairs": 646, + "min_sentence1_length": 11, + "average_sentence1_length": 52.128482972136226, + "max_sentence1_length": 258, + "unique_sentence1": 646, + "min_sentence2_length": 10, + "average_sentence2_length": 20.597523219814242, + "max_sentence2_length": 78, + "unique_sentence2": 646 + }, + "ind-msa": { + "num_samples": 455, + "number_of_characters": 61802, + "unique_pairs": 455, + "min_sentence1_length": 17, + "average_sentence1_length": 69.12527472527472, + "max_sentence1_length": 135, + "unique_sentence1": 455, + "min_sentence2_length": 16, + "average_sentence2_length": 66.7032967032967, + "max_sentence2_length": 156, + "unique_sentence2": 455 + }, + "ind-tgl": { + "num_samples": 378, + "number_of_characters": 40452, + "unique_pairs": 378, + "min_sentence1_length": 13, + "average_sentence1_length": 51.301587301587304, + "max_sentence1_length": 122, + "unique_sentence1": 378, + "min_sentence2_length": 13, + "average_sentence2_length": 55.714285714285715, + "max_sentence2_length": 136, + "unique_sentence2": 378 + }, + "ind-tha": { + "num_samples": 1258, + "number_of_characters": 129949, + "unique_pairs": 1258, + "min_sentence1_length": 11, + "average_sentence1_length": 56.66295707472178, + "max_sentence1_length": 159, + "unique_sentence1": 1258, + "min_sentence2_length": 11, + "average_sentence2_length": 46.63513513513514, + "max_sentence2_length": 153, + "unique_sentence2": 1258 + }, + "bul-ces": { + "num_samples": 1485, + "number_of_characters": 156647, + "unique_pairs": 1485, + "min_sentence1_length": 12, + "average_sentence1_length": 55.73468013468013, + "max_sentence1_length": 273, + "unique_sentence1": 1485, + "min_sentence2_length": 10, + "average_sentence2_length": 49.75151515151515, + "max_sentence2_length": 265, + "unique_sentence2": 1485 + }, + "bul-lav": { + "num_samples": 710, + "number_of_characters": 72767, + "unique_pairs": 710, + "min_sentence1_length": 14, + "average_sentence1_length": 52.807042253521125, + "max_sentence1_length": 169, + "unique_sentence1": 710, + "min_sentence2_length": 13, + "average_sentence2_length": 49.68169014084507, + "max_sentence2_length": 137, + "unique_sentence2": 710 + }, + "bul-lit": { + "num_samples": 803, + "number_of_characters": 83819, + "unique_pairs": 803, + "min_sentence1_length": 14, + "average_sentence1_length": 53.80821917808219, + "max_sentence1_length": 223, + "unique_sentence1": 803, + "min_sentence2_length": 14, + "average_sentence2_length": 50.57409713574097, + "max_sentence2_length": 188, + "unique_sentence2": 803 + }, + "bul-pol": { + "num_samples": 1635, + "number_of_characters": 182803, + "unique_pairs": 1635, + "min_sentence1_length": 12, + "average_sentence1_length": 56.70581039755351, + "max_sentence1_length": 224, + "unique_sentence1": 1635, + "min_sentence2_length": 14, + "average_sentence2_length": 55.10030581039755, + "max_sentence2_length": 223, + "unique_sentence2": 1635 + }, + "bul-rus": { + "num_samples": 1476, + "number_of_characters": 171755, + "unique_pairs": 1476, + "min_sentence1_length": 11, + "average_sentence1_length": 58.61178861788618, + "max_sentence1_length": 273, + "unique_sentence1": 1476, + "min_sentence2_length": 13, + "average_sentence2_length": 57.75338753387534, + "max_sentence2_length": 263, + "unique_sentence2": 1476 + }, + "bul-slk": { + "num_samples": 1154, + "number_of_characters": 122062, + "unique_pairs": 1154, + "min_sentence1_length": 14, + "average_sentence1_length": 55.46707105719238, + "max_sentence1_length": 273, + "unique_sentence1": 1154, + "min_sentence2_length": 12, + "average_sentence2_length": 50.305892547660314, + "max_sentence2_length": 234, + "unique_sentence2": 1154 + }, + "bul-slv": { + "num_samples": 1034, + "number_of_characters": 120376, + "unique_pairs": 1034, + "min_sentence1_length": 13, + "average_sentence1_length": 60.3926499032882, + "max_sentence1_length": 273, + "unique_sentence1": 1034, + "min_sentence2_length": 12, + "average_sentence2_length": 56.02514506769826, + "max_sentence2_length": 252, + "unique_sentence2": 1034 + }, + "bul-srp": { + "num_samples": 296, + "number_of_characters": 29879, + "unique_pairs": 296, + "min_sentence1_length": 17, + "average_sentence1_length": 51.28378378378378, + "max_sentence1_length": 146, + "unique_sentence1": 296, + "min_sentence2_length": 10, + "average_sentence2_length": 49.65878378378378, + "max_sentence2_length": 118, + "unique_sentence2": 296 + }, + "bul-ukr": { + "num_samples": 1074, + "number_of_characters": 120520, + "unique_pairs": 1074, + "min_sentence1_length": 11, + "average_sentence1_length": 57.256052141527, + "max_sentence1_length": 155, + "unique_sentence1": 1074, + "min_sentence2_length": 12, + "average_sentence2_length": 54.95996275605214, + "max_sentence2_length": 152, + "unique_sentence2": 1074 + }, + "ces-lav": { + "num_samples": 875, + "number_of_characters": 81949, + "unique_pairs": 875, + "min_sentence1_length": 12, + "average_sentence1_length": 45.45485714285714, + "max_sentence1_length": 133, + "unique_sentence1": 875, + "min_sentence2_length": 13, + "average_sentence2_length": 48.201142857142855, + "max_sentence2_length": 151, + "unique_sentence2": 875 + }, + "ces-lit": { + "num_samples": 1002, + "number_of_characters": 95191, + "unique_pairs": 1002, + "min_sentence1_length": 10, + "average_sentence1_length": 45.98802395209581, + "max_sentence1_length": 129, + "unique_sentence1": 1002, + "min_sentence2_length": 12, + "average_sentence2_length": 49.01297405189621, + "max_sentence2_length": 177, + "unique_sentence2": 1002 + }, + "ces-pol": { + "num_samples": 3367, + "number_of_characters": 335068, + "unique_pairs": 3367, + "min_sentence1_length": 10, + "average_sentence1_length": 47.42084942084942, + "max_sentence1_length": 211, + "unique_sentence1": 3367, + "min_sentence2_length": 13, + "average_sentence2_length": 52.09444609444609, + "max_sentence2_length": 231, + "unique_sentence2": 3367 + }, + "ces-rus": { + "num_samples": 2144, + "number_of_characters": 221574, + "unique_pairs": 2144, + "min_sentence1_length": 10, + "average_sentence1_length": 48.921175373134325, + "max_sentence1_length": 265, + "unique_sentence1": 2144, + "min_sentence2_length": 13, + "average_sentence2_length": 54.42490671641791, + "max_sentence2_length": 263, + "unique_sentence2": 2144 + }, + "ces-slk": { + "num_samples": 2551, + "number_of_characters": 253025, + "unique_pairs": 2551, + "min_sentence1_length": 10, + "average_sentence1_length": 48.90905527244218, + "max_sentence1_length": 265, + "unique_sentence1": 2551, + "min_sentence2_length": 10, + "average_sentence2_length": 50.277538220305765, + "max_sentence2_length": 237, + "unique_sentence2": 2551 + }, + "ces-slv": { + "num_samples": 1370, + "number_of_characters": 145413, + "unique_pairs": 1370, + "min_sentence1_length": 10, + "average_sentence1_length": 51.00656934306569, + "max_sentence1_length": 265, + "unique_sentence1": 1370, + "min_sentence2_length": 11, + "average_sentence2_length": 55.13430656934307, + "max_sentence2_length": 252, + "unique_sentence2": 1370 + }, + "ces-srp": { + "num_samples": 362, + "number_of_characters": 37517, + "unique_pairs": 362, + "min_sentence1_length": 15, + "average_sentence1_length": 49.98618784530387, + "max_sentence1_length": 129, + "unique_sentence1": 362, + "min_sentence2_length": 16, + "average_sentence2_length": 53.651933701657455, + "max_sentence2_length": 129, + "unique_sentence2": 362 + }, + "ces-ukr": { + "num_samples": 1285, + "number_of_characters": 133762, + "unique_pairs": 1285, + "min_sentence1_length": 10, + "average_sentence1_length": 49.67237354085603, + "max_sentence1_length": 233, + "unique_sentence1": 1285, + "min_sentence2_length": 12, + "average_sentence2_length": 54.422568093385216, + "max_sentence2_length": 253, + "unique_sentence2": 1285 + }, + "hrv-slk": { + "num_samples": 313, + "number_of_characters": 38510, + "unique_pairs": 313, + "min_sentence1_length": 16, + "average_sentence1_length": 61.019169329073485, + "max_sentence1_length": 174, + "unique_sentence1": 313, + "min_sentence2_length": 15, + "average_sentence2_length": 62.01597444089457, + "max_sentence2_length": 182, + "unique_sentence2": 313 + }, + "kat-rus": { + "num_samples": 262, + "number_of_characters": 26840, + "unique_pairs": 262, + "min_sentence1_length": 14, + "average_sentence1_length": 51.595419847328245, + "max_sentence1_length": 179, + "unique_sentence1": 262, + "min_sentence2_length": 16, + "average_sentence2_length": 50.847328244274806, + "max_sentence2_length": 186, + "unique_sentence2": 262 + }, + "lav-lit": { + "num_samples": 1061, + "number_of_characters": 101845, + "unique_pairs": 1061, + "min_sentence1_length": 11, + "average_sentence1_length": 47.86899151743638, + "max_sentence1_length": 283, + "unique_sentence1": 1061, + "min_sentence2_length": 12, + "average_sentence2_length": 48.120640904806784, + "max_sentence2_length": 268, + "unique_sentence2": 1061 + }, + "lav-pol": { + "num_samples": 951, + "number_of_characters": 94163, + "unique_pairs": 951, + "min_sentence1_length": 11, + "average_sentence1_length": 48.29232386961094, + "max_sentence1_length": 172, + "unique_sentence1": 951, + "min_sentence2_length": 14, + "average_sentence2_length": 50.722397476340696, + "max_sentence2_length": 208, + "unique_sentence2": 951 + }, + "lav-rus": { + "num_samples": 1412, + "number_of_characters": 141609, + "unique_pairs": 1412, + "min_sentence1_length": 11, + "average_sentence1_length": 49.20254957507082, + "max_sentence1_length": 146, + "unique_sentence1": 1412, + "min_sentence2_length": 13, + "average_sentence2_length": 51.0871104815864, + "max_sentence2_length": 182, + "unique_sentence2": 1412 + }, + "lav-slk": { + "num_samples": 789, + "number_of_characters": 72654, + "unique_pairs": 789, + "min_sentence1_length": 13, + "average_sentence1_length": 46.750316856780735, + "max_sentence1_length": 172, + "unique_sentence1": 789, + "min_sentence2_length": 12, + "average_sentence2_length": 45.333333333333336, + "max_sentence2_length": 162, + "unique_sentence2": 789 + }, + "lav-slv": { + "num_samples": 518, + "number_of_characters": 48873, + "unique_pairs": 518, + "min_sentence1_length": 13, + "average_sentence1_length": 46.72586872586873, + "max_sentence1_length": 126, + "unique_sentence1": 518, + "min_sentence2_length": 13, + "average_sentence2_length": 47.62355212355212, + "max_sentence2_length": 125, + "unique_sentence2": 518 + }, + "lav-ukr": { + "num_samples": 579, + "number_of_characters": 52695, + "unique_pairs": 579, + "min_sentence1_length": 11, + "average_sentence1_length": 45.799654576856646, + "max_sentence1_length": 151, + "unique_sentence1": 579, + "min_sentence2_length": 12, + "average_sentence2_length": 45.21070811744387, + "max_sentence2_length": 170, + "unique_sentence2": 579 + }, + "lit-pol": { + "num_samples": 1026, + "number_of_characters": 102814, + "unique_pairs": 1026, + "min_sentence1_length": 13, + "average_sentence1_length": 48.93664717348928, + "max_sentence1_length": 188, + "unique_sentence1": 1026, + "min_sentence2_length": 15, + "average_sentence2_length": 51.271929824561404, + "max_sentence2_length": 206, + "unique_sentence2": 1026 + }, + "lit-rus": { + "num_samples": 961, + "number_of_characters": 96822, + "unique_pairs": 961, + "min_sentence1_length": 12, + "average_sentence1_length": 49.50156087408949, + "max_sentence1_length": 155, + "unique_sentence1": 961, + "min_sentence2_length": 12, + "average_sentence2_length": 51.24973985431842, + "max_sentence2_length": 182, + "unique_sentence2": 961 + }, + "lit-slk": { + "num_samples": 859, + "number_of_characters": 82605, + "unique_pairs": 859, + "min_sentence1_length": 12, + "average_sentence1_length": 49.033760186263095, + "max_sentence1_length": 188, + "unique_sentence1": 859, + "min_sentence2_length": 10, + "average_sentence2_length": 47.130384167636784, + "max_sentence2_length": 195, + "unique_sentence2": 859 + }, + "lit-slv": { + "num_samples": 607, + "number_of_characters": 58755, + "unique_pairs": 607, + "min_sentence1_length": 12, + "average_sentence1_length": 48.056013179571664, + "max_sentence1_length": 155, + "unique_sentence1": 607, + "min_sentence2_length": 11, + "average_sentence2_length": 48.73970345963756, + "max_sentence2_length": 145, + "unique_sentence2": 607 + }, + "lit-ukr": { + "num_samples": 639, + "number_of_characters": 59450, + "unique_pairs": 639, + "min_sentence1_length": 14, + "average_sentence1_length": 46.9358372456964, + "max_sentence1_length": 149, + "unique_sentence1": 639, + "min_sentence2_length": 13, + "average_sentence2_length": 46.10015649452269, + "max_sentence2_length": 170, + "unique_sentence2": 639 + }, + "pol-rus": { + "num_samples": 5014, + "number_of_characters": 536093, + "unique_pairs": 5014, + "min_sentence1_length": 12, + "average_sentence1_length": 53.28500199441564, + "max_sentence1_length": 424, + "unique_sentence1": 5014, + "min_sentence2_length": 11, + "average_sentence2_length": 53.63422417231751, + "max_sentence2_length": 456, + "unique_sentence2": 5014 + }, + "pol-slk": { + "num_samples": 1918, + "number_of_characters": 196496, + "unique_pairs": 1918, + "min_sentence1_length": 14, + "average_sentence1_length": 53.03023983315954, + "max_sentence1_length": 206, + "unique_sentence1": 1918, + "min_sentence2_length": 10, + "average_sentence2_length": 49.418143899895725, + "max_sentence2_length": 194, + "unique_sentence2": 1918 + }, + "pol-slv": { + "num_samples": 1382, + "number_of_characters": 160396, + "unique_pairs": 1382, + "min_sentence1_length": 9, + "average_sentence1_length": 57.59406657018813, + "max_sentence1_length": 180, + "unique_sentence1": 1382, + "min_sentence2_length": 8, + "average_sentence2_length": 58.46671490593343, + "max_sentence2_length": 146, + "unique_sentence2": 1382 + }, + "pol-srp": { + "num_samples": 492, + "number_of_characters": 54772, + "unique_pairs": 492, + "min_sentence1_length": 19, + "average_sentence1_length": 54.552845528455286, + "max_sentence1_length": 148, + "unique_sentence1": 492, + "min_sentence2_length": 16, + "average_sentence2_length": 56.77235772357724, + "max_sentence2_length": 129, + "unique_sentence2": 492 + }, + "pol-ukr": { + "num_samples": 2370, + "number_of_characters": 251424, + "unique_pairs": 2370, + "min_sentence1_length": 14, + "average_sentence1_length": 53.79789029535865, + "max_sentence1_length": 305, + "unique_sentence1": 2370, + "min_sentence2_length": 12, + "average_sentence2_length": 52.28818565400844, + "max_sentence2_length": 202, + "unique_sentence2": 2370 + }, + "rus-slk": { + "num_samples": 1263, + "number_of_characters": 136750, + "unique_pairs": 1263, + "min_sentence1_length": 13, + "average_sentence1_length": 56.64528899445764, + "max_sentence1_length": 316, + "unique_sentence1": 1263, + "min_sentence2_length": 10, + "average_sentence2_length": 51.62866191607284, + "max_sentence2_length": 309, + "unique_sentence2": 1263 + }, + "rus-slv": { + "num_samples": 1096, + "number_of_characters": 133263, + "unique_pairs": 1096, + "min_sentence1_length": 14, + "average_sentence1_length": 62.56204379562044, + "max_sentence1_length": 263, + "unique_sentence1": 1096, + "min_sentence2_length": 11, + "average_sentence2_length": 59.028284671532845, + "max_sentence2_length": 252, + "unique_sentence2": 1096 + }, + "rus-srp": { + "num_samples": 455, + "number_of_characters": 51800, + "unique_pairs": 455, + "min_sentence1_length": 17, + "average_sentence1_length": 57.51868131868132, + "max_sentence1_length": 122, + "unique_sentence1": 455, + "min_sentence2_length": 12, + "average_sentence2_length": 56.32747252747253, + "max_sentence2_length": 129, + "unique_sentence2": 455 + }, + "rus-ukr": { + "num_samples": 15251, + "number_of_characters": 1504266, + "unique_pairs": 15251, + "min_sentence1_length": 10, + "average_sentence1_length": 49.97587043472559, + "max_sentence1_length": 308, + "unique_sentence1": 15251, + "min_sentence2_length": 10, + "average_sentence2_length": 48.65805520949446, + "max_sentence2_length": 353, + "unique_sentence2": 15251 + }, + "slk-slv": { + "num_samples": 1259, + "number_of_characters": 133621, + "unique_pairs": 1259, + "min_sentence1_length": 10, + "average_sentence1_length": 50.423351866560765, + "max_sentence1_length": 234, + "unique_sentence1": 1259, + "min_sentence2_length": 11, + "average_sentence2_length": 55.70929308975377, + "max_sentence2_length": 252, + "unique_sentence2": 1259 + }, + "slk-srp": { + "num_samples": 561, + "number_of_characters": 57637, + "unique_pairs": 561, + "min_sentence1_length": 15, + "average_sentence1_length": 47.934046345811055, + "max_sentence1_length": 117, + "unique_sentence1": 561, + "min_sentence2_length": 16, + "average_sentence2_length": 54.805704099821746, + "max_sentence2_length": 112, + "unique_sentence2": 561 + }, + "slk-ukr": { + "num_samples": 944, + "number_of_characters": 90612, + "unique_pairs": 944, + "min_sentence1_length": 10, + "average_sentence1_length": 46.8728813559322, + "max_sentence1_length": 237, + "unique_sentence1": 944, + "min_sentence2_length": 12, + "average_sentence2_length": 49.11440677966102, + "max_sentence2_length": 253, + "unique_sentence2": 944 + }, + "slv-srp": { + "num_samples": 499, + "number_of_characters": 60828, + "unique_pairs": 499, + "min_sentence1_length": 16, + "average_sentence1_length": 63.63927855711423, + "max_sentence1_length": 122, + "unique_sentence1": 499, + "min_sentence2_length": 16, + "average_sentence2_length": 58.26052104208417, + "max_sentence2_length": 118, + "unique_sentence2": 499 + }, + "slv-ukr": { + "num_samples": 733, + "number_of_characters": 86586, + "unique_pairs": 733, + "min_sentence1_length": 11, + "average_sentence1_length": 57.88403819918145, + "max_sentence1_length": 124, + "unique_sentence1": 733, + "min_sentence2_length": 12, + "average_sentence2_length": 60.241473396998636, + "max_sentence2_length": 132, + "unique_sentence2": 733 + }, + "cat-deu": { + "num_samples": 302, + "number_of_characters": 33752, + "unique_pairs": 302, + "min_sentence1_length": 13, + "average_sentence1_length": 53.35099337748344, + "max_sentence1_length": 255, + "unique_sentence1": 302, + "min_sentence2_length": 12, + "average_sentence2_length": 58.41059602649007, + "max_sentence2_length": 269, + "unique_sentence2": 302 + }, + "cat-fra": { + "num_samples": 598, + "number_of_characters": 69263, + "unique_pairs": 598, + "min_sentence1_length": 14, + "average_sentence1_length": 53.40468227424749, + "max_sentence1_length": 202, + "unique_sentence1": 598, + "min_sentence2_length": 16, + "average_sentence2_length": 62.419732441471574, + "max_sentence2_length": 235, + "unique_sentence2": 598 + }, + "cat-ita": { + "num_samples": 418, + "number_of_characters": 43550, + "unique_pairs": 418, + "min_sentence1_length": 15, + "average_sentence1_length": 50.67464114832536, + "max_sentence1_length": 195, + "unique_sentence1": 418, + "min_sentence2_length": 15, + "average_sentence2_length": 53.51196172248804, + "max_sentence2_length": 230, + "unique_sentence2": 418 + }, + "cat-por": { + "num_samples": 370, + "number_of_characters": 36411, + "unique_pairs": 370, + "min_sentence1_length": 15, + "average_sentence1_length": 48.71891891891892, + "max_sentence1_length": 193, + "unique_sentence1": 370, + "min_sentence2_length": 15, + "average_sentence2_length": 49.689189189189186, + "max_sentence2_length": 211, + "unique_sentence2": 370 + }, + "cat-spa": { + "num_samples": 2648, + "number_of_characters": 285897, + "unique_pairs": 2648, + "min_sentence1_length": 10, + "average_sentence1_length": 52.98489425981873, + "max_sentence1_length": 216, + "unique_sentence1": 2648, + "min_sentence2_length": 11, + "average_sentence2_length": 54.98225075528701, + "max_sentence2_length": 247, + "unique_sentence2": 2648 + }, + "dan-deu": { + "num_samples": 4337, + "number_of_characters": 463434, + "unique_pairs": 4337, + "min_sentence1_length": 11, + "average_sentence1_length": 50.83537007147798, + "max_sentence1_length": 198, + "unique_sentence1": 4337, + "min_sentence2_length": 11, + "average_sentence2_length": 56.02052109753286, + "max_sentence2_length": 224, + "unique_sentence2": 4337 + }, + "dan-fra": { + "num_samples": 3802, + "number_of_characters": 434788, + "unique_pairs": 3802, + "min_sentence1_length": 12, + "average_sentence1_length": 51.91977906365071, + "max_sentence1_length": 296, + "unique_sentence1": 3802, + "min_sentence2_length": 11, + "average_sentence2_length": 62.43792740662809, + "max_sentence2_length": 345, + "unique_sentence2": 3802 + }, + "dan-isl": { + "num_samples": 327, + "number_of_characters": 32853, + "unique_pairs": 327, + "min_sentence1_length": 12, + "average_sentence1_length": 51.35474006116208, + "max_sentence1_length": 198, + "unique_sentence1": 327, + "min_sentence2_length": 12, + "average_sentence2_length": 49.1131498470948, + "max_sentence2_length": 181, + "unique_sentence2": 327 + }, + "dan-ita": { + "num_samples": 3818, + "number_of_characters": 421045, + "unique_pairs": 3818, + "min_sentence1_length": 12, + "average_sentence1_length": 53.20324777370351, + "max_sentence1_length": 296, + "unique_sentence1": 3818, + "min_sentence2_length": 11, + "average_sentence2_length": 57.07569408067051, + "max_sentence2_length": 271, + "unique_sentence2": 3818 + }, + "dan-nld": { + "num_samples": 4099, + "number_of_characters": 428737, + "unique_pairs": 4099, + "min_sentence1_length": 11, + "average_sentence1_length": 51.53330080507441, + "max_sentence1_length": 225, + "unique_sentence1": 4099, + "min_sentence2_length": 10, + "average_sentence2_length": 53.06221029519395, + "max_sentence2_length": 254, + "unique_sentence2": 4099 + }, + "dan-nor": { + "num_samples": 2603, + "number_of_characters": 278953, + "unique_pairs": 2603, + "min_sentence1_length": 13, + "average_sentence1_length": 54.74375720322705, + "max_sentence1_length": 296, + "unique_sentence1": 2603, + "min_sentence2_length": 11, + "average_sentence2_length": 52.42220514790626, + "max_sentence2_length": 291, + "unique_sentence2": 2603 + }, + "dan-por": { + "num_samples": 3206, + "number_of_characters": 349267, + "unique_pairs": 3206, + "min_sentence1_length": 12, + "average_sentence1_length": 52.78290704928259, + "max_sentence1_length": 176, + "unique_sentence1": 3206, + "min_sentence2_length": 10, + "average_sentence2_length": 56.15876481597006, + "max_sentence2_length": 211, + "unique_sentence2": 3206 + }, + "dan-ron": { + "num_samples": 2052, + "number_of_characters": 225425, + "unique_pairs": 2052, + "min_sentence1_length": 8, + "average_sentence1_length": 53.82358674463938, + "max_sentence1_length": 296, + "unique_sentence1": 2052, + "min_sentence2_length": 9, + "average_sentence2_length": 56.03265107212476, + "max_sentence2_length": 281, + "unique_sentence2": 2052 + }, + "dan-spa": { + "num_samples": 3571, + "number_of_characters": 389048, + "unique_pairs": 3571, + "min_sentence1_length": 12, + "average_sentence1_length": 52.05740688882666, + "max_sentence1_length": 257, + "unique_sentence1": 3571, + "min_sentence2_length": 12, + "average_sentence2_length": 56.88910669280314, + "max_sentence2_length": 239, + "unique_sentence2": 3571 + }, + "dan-swe": { + "num_samples": 4268, + "number_of_characters": 440347, + "unique_pairs": 4268, + "min_sentence1_length": 12, + "average_sentence1_length": 52.087160262417996, + "max_sentence1_length": 315, + "unique_sentence1": 4268, + "min_sentence2_length": 11, + "average_sentence2_length": 51.0869259606373, + "max_sentence2_length": 330, + "unique_sentence2": 4268 + }, + "deu-fra": { + "num_samples": 27727, + "number_of_characters": 3222716, + "unique_pairs": 27727, + "min_sentence1_length": 6, + "average_sentence1_length": 55.5575431889494, + "max_sentence1_length": 337, + "unique_sentence1": 27727, + "min_sentence2_length": 6, + "average_sentence2_length": 60.67270169870523, + "max_sentence2_length": 330, + "unique_sentence2": 27727 + }, + "deu-isl": { + "num_samples": 294, + "number_of_characters": 31097, + "unique_pairs": 294, + "min_sentence1_length": 14, + "average_sentence1_length": 56.085034013605444, + "max_sentence1_length": 201, + "unique_sentence1": 294, + "min_sentence2_length": 14, + "average_sentence2_length": 49.68707482993197, + "max_sentence2_length": 181, + "unique_sentence2": 294 + }, + "deu-ita": { + "num_samples": 18787, + "number_of_characters": 2100285, + "unique_pairs": 18787, + "min_sentence1_length": 10, + "average_sentence1_length": 56.25480385372864, + "max_sentence1_length": 346, + "unique_sentence1": 18787, + "min_sentence2_length": 10, + "average_sentence2_length": 55.53978815138127, + "max_sentence2_length": 357, + "unique_sentence2": 18787 + }, + "deu-nld": { + "num_samples": 14211, + "number_of_characters": 1508365, + "unique_pairs": 14211, + "min_sentence1_length": 10, + "average_sentence1_length": 54.50777566673703, + "max_sentence1_length": 299, + "unique_sentence1": 14211, + "min_sentence2_length": 10, + "average_sentence2_length": 51.63289001477729, + "max_sentence2_length": 276, + "unique_sentence2": 14211 + }, + "deu-nor": { + "num_samples": 2783, + "number_of_characters": 295406, + "unique_pairs": 2783, + "min_sentence1_length": 10, + "average_sentence1_length": 56.26122888968739, + "max_sentence1_length": 224, + "unique_sentence1": 2783, + "min_sentence2_length": 11, + "average_sentence2_length": 49.88537549407115, + "max_sentence2_length": 163, + "unique_sentence2": 2783 + }, + "deu-por": { + "num_samples": 11319, + "number_of_characters": 1213812, + "unique_pairs": 11319, + "min_sentence1_length": 9, + "average_sentence1_length": 54.69635126777984, + "max_sentence1_length": 374, + "unique_sentence1": 11319, + "min_sentence2_length": 10, + "average_sentence2_length": 52.540330417881435, + "max_sentence2_length": 337, + "unique_sentence2": 11319 + }, + "deu-ron": { + "num_samples": 3598, + "number_of_characters": 401116, + "unique_pairs": 3598, + "min_sentence1_length": 8, + "average_sentence1_length": 57.00611450806004, + "max_sentence1_length": 716, + "unique_sentence1": 3598, + "min_sentence2_length": 9, + "average_sentence2_length": 54.4769316286826, + "max_sentence2_length": 699, + "unique_sentence2": 3598 + }, + "deu-spa": { + "num_samples": 19739, + "number_of_characters": 2151487, + "unique_pairs": 19739, + "min_sentence1_length": 9, + "average_sentence1_length": 54.838593647094584, + "max_sentence1_length": 428, + "unique_sentence1": 19739, + "min_sentence2_length": 10, + "average_sentence2_length": 54.15816404073155, + "max_sentence2_length": 415, + "unique_sentence2": 19739 + }, + "deu-swe": { + "num_samples": 5772, + "number_of_characters": 610949, + "unique_pairs": 5772, + "min_sentence1_length": 10, + "average_sentence1_length": 55.78326403326403, + "max_sentence1_length": 264, + "unique_sentence1": 5772, + "min_sentence2_length": 10, + "average_sentence2_length": 50.06375606375607, + "max_sentence2_length": 237, + "unique_sentence2": 5772 + }, + "fra-isl": { + "num_samples": 347, + "number_of_characters": 39368, + "unique_pairs": 347, + "min_sentence1_length": 14, + "average_sentence1_length": 62.11527377521614, + "max_sentence1_length": 229, + "unique_sentence1": 347, + "min_sentence2_length": 16, + "average_sentence2_length": 51.3371757925072, + "max_sentence2_length": 194, + "unique_sentence2": 347 + }, + "fra-ita": { + "num_samples": 20002, + "number_of_characters": 2326983, + "unique_pairs": 20002, + "min_sentence1_length": 10, + "average_sentence1_length": 61.115738426157385, + "max_sentence1_length": 743, + "unique_sentence1": 20002, + "min_sentence2_length": 8, + "average_sentence2_length": 55.22177782221778, + "max_sentence2_length": 653, + "unique_sentence2": 20002 + }, + "fra-nld": { + "num_samples": 14684, + "number_of_characters": 1659621, + "unique_pairs": 14684, + "min_sentence1_length": 10, + "average_sentence1_length": 60.335739580495776, + "max_sentence1_length": 400, + "unique_sentence1": 14684, + "min_sentence2_length": 10, + "average_sentence2_length": 52.68666575864887, + "max_sentence2_length": 263, + "unique_sentence2": 14684 + }, + "fra-nor": { + "num_samples": 2558, + "number_of_characters": 290540, + "unique_pairs": 2558, + "min_sentence1_length": 10, + "average_sentence1_length": 63.30609851446442, + "max_sentence1_length": 377, + "unique_sentence1": 2558, + "min_sentence2_length": 11, + "average_sentence2_length": 50.274824081313525, + "max_sentence2_length": 305, + "unique_sentence2": 2558 + }, + "fra-por": { + "num_samples": 13265, + "number_of_characters": 1485292, + "unique_pairs": 13265, + "min_sentence1_length": 10, + "average_sentence1_length": 59.58296268375424, + "max_sentence1_length": 284, + "unique_sentence1": 13265, + "min_sentence2_length": 8, + "average_sentence2_length": 52.3877874104787, + "max_sentence2_length": 443, + "unique_sentence2": 13265 + }, + "fra-ron": { + "num_samples": 3295, + "number_of_characters": 381273, + "unique_pairs": 3295, + "min_sentence1_length": 8, + "average_sentence1_length": 61.61699544764795, + "max_sentence1_length": 677, + "unique_sentence1": 3295, + "min_sentence2_length": 8, + "average_sentence2_length": 54.095599393019725, + "max_sentence2_length": 587, + "unique_sentence2": 3295 + }, + "fra-spa": { + "num_samples": 23311, + "number_of_characters": 2650610, + "unique_pairs": 23311, + "min_sentence1_length": 10, + "average_sentence1_length": 59.62794388915105, + "max_sentence1_length": 677, + "unique_sentence1": 23311, + "min_sentence2_length": 8, + "average_sentence2_length": 54.078460812491954, + "max_sentence2_length": 576, + "unique_sentence2": 23311 + }, + "fra-swe": { + "num_samples": 5006, + "number_of_characters": 565299, + "unique_pairs": 5006, + "min_sentence1_length": 10, + "average_sentence1_length": 62.228925289652416, + "max_sentence1_length": 345, + "unique_sentence1": 5006, + "min_sentence2_length": 11, + "average_sentence2_length": 50.69536556132641, + "max_sentence2_length": 289, + "unique_sentence2": 5006 + }, + "isl-ita": { + "num_samples": 421, + "number_of_characters": 42563, + "unique_pairs": 421, + "min_sentence1_length": 14, + "average_sentence1_length": 48.11638954869359, + "max_sentence1_length": 145, + "unique_sentence1": 421, + "min_sentence2_length": 13, + "average_sentence2_length": 52.9833729216152, + "max_sentence2_length": 151, + "unique_sentence2": 421 + }, + "isl-nld": { + "num_samples": 311, + "number_of_characters": 33292, + "unique_pairs": 311, + "min_sentence1_length": 13, + "average_sentence1_length": 51.4951768488746, + "max_sentence1_length": 160, + "unique_sentence1": 311, + "min_sentence2_length": 11, + "average_sentence2_length": 55.553054662379424, + "max_sentence2_length": 228, + "unique_sentence2": 311 + }, + "isl-por": { + "num_samples": 341, + "number_of_characters": 34367, + "unique_pairs": 341, + "min_sentence1_length": 12, + "average_sentence1_length": 48.82697947214076, + "max_sentence1_length": 135, + "unique_sentence1": 341, + "min_sentence2_length": 12, + "average_sentence2_length": 51.956011730205276, + "max_sentence2_length": 156, + "unique_sentence2": 341 + }, + "isl-spa": { + "num_samples": 366, + "number_of_characters": 38563, + "unique_pairs": 366, + "min_sentence1_length": 14, + "average_sentence1_length": 49.80327868852459, + "max_sentence1_length": 196, + "unique_sentence1": 366, + "min_sentence2_length": 17, + "average_sentence2_length": 55.560109289617486, + "max_sentence2_length": 206, + "unique_sentence2": 366 + }, + "isl-swe": { + "num_samples": 312, + "number_of_characters": 30662, + "unique_pairs": 312, + "min_sentence1_length": 14, + "average_sentence1_length": 48.83653846153846, + "max_sentence1_length": 149, + "unique_sentence1": 312, + "min_sentence2_length": 15, + "average_sentence2_length": 49.43910256410256, + "max_sentence2_length": 141, + "unique_sentence2": 312 + }, + "ita-nld": { + "num_samples": 9160, + "number_of_characters": 973718, + "unique_pairs": 9160, + "min_sentence1_length": 11, + "average_sentence1_length": 54.19890829694323, + "max_sentence1_length": 291, + "unique_sentence1": 9160, + "min_sentence2_length": 10, + "average_sentence2_length": 52.10218340611354, + "max_sentence2_length": 337, + "unique_sentence2": 9160 + }, + "ita-nor": { + "num_samples": 2516, + "number_of_characters": 267838, + "unique_pairs": 2516, + "min_sentence1_length": 13, + "average_sentence1_length": 56.17885532591415, + "max_sentence1_length": 356, + "unique_sentence1": 2516, + "min_sentence2_length": 12, + "average_sentence2_length": 50.27503974562798, + "max_sentence2_length": 305, + "unique_sentence2": 2516 + }, + "ita-por": { + "num_samples": 10924, + "number_of_characters": 1157982, + "unique_pairs": 10924, + "min_sentence1_length": 10, + "average_sentence1_length": 53.500274624679605, + "max_sentence1_length": 357, + "unique_sentence1": 10924, + "min_sentence2_length": 10, + "average_sentence2_length": 52.503203954595385, + "max_sentence2_length": 276, + "unique_sentence2": 10924 + }, + "ita-ron": { + "num_samples": 3360, + "number_of_characters": 375697, + "unique_pairs": 3360, + "min_sentence1_length": 8, + "average_sentence1_length": 56.383630952380955, + "max_sentence1_length": 346, + "unique_sentence1": 3360, + "min_sentence2_length": 9, + "average_sentence2_length": 55.430952380952384, + "max_sentence2_length": 402, + "unique_sentence2": 3360 + }, + "ita-spa": { + "num_samples": 16534, + "number_of_characters": 1778757, + "unique_pairs": 16534, + "min_sentence1_length": 10, + "average_sentence1_length": 53.5578202491835, + "max_sentence1_length": 362, + "unique_sentence1": 16534, + "min_sentence2_length": 11, + "average_sentence2_length": 54.023950647151324, + "max_sentence2_length": 363, + "unique_sentence2": 16534 + }, + "ita-swe": { + "num_samples": 4741, + "number_of_characters": 508653, + "unique_pairs": 4741, + "min_sentence1_length": 10, + "average_sentence1_length": 55.86985867960346, + "max_sentence1_length": 271, + "unique_sentence1": 4741, + "min_sentence2_length": 10, + "average_sentence2_length": 51.41826618856781, + "max_sentence2_length": 289, + "unique_sentence2": 4741 + }, + "nld-nor": { + "num_samples": 2664, + "number_of_characters": 281584, + "unique_pairs": 2664, + "min_sentence1_length": 10, + "average_sentence1_length": 54.619369369369366, + "max_sentence1_length": 211, + "unique_sentence1": 2664, + "min_sentence2_length": 11, + "average_sentence2_length": 51.080330330330334, + "max_sentence2_length": 162, + "unique_sentence2": 2664 + }, + "nld-por": { + "num_samples": 7021, + "number_of_characters": 738280, + "unique_pairs": 7021, + "min_sentence1_length": 10, + "average_sentence1_length": 51.91012676256943, + "max_sentence1_length": 243, + "unique_sentence1": 7021, + "min_sentence2_length": 11, + "average_sentence2_length": 53.24298532972511, + "max_sentence2_length": 443, + "unique_sentence2": 7021 + }, + "nld-ron": { + "num_samples": 2888, + "number_of_characters": 311689, + "unique_pairs": 2888, + "min_sentence1_length": 10, + "average_sentence1_length": 53.15547091412743, + "max_sentence1_length": 228, + "unique_sentence1": 2888, + "min_sentence2_length": 11, + "average_sentence2_length": 54.770083102493075, + "max_sentence2_length": 252, + "unique_sentence2": 2888 + }, + "nld-spa": { + "num_samples": 9555, + "number_of_characters": 1015756, + "unique_pairs": 9555, + "min_sentence1_length": 10, + "average_sentence1_length": 51.9134484563056, + "max_sentence1_length": 229, + "unique_sentence1": 9555, + "min_sentence2_length": 10, + "average_sentence2_length": 54.39277864992151, + "max_sentence2_length": 235, + "unique_sentence2": 9555 + }, + "nld-swe": { + "num_samples": 5072, + "number_of_characters": 529162, + "unique_pairs": 5072, + "min_sentence1_length": 10, + "average_sentence1_length": 53.53410883280757, + "max_sentence1_length": 333, + "unique_sentence1": 5072, + "min_sentence2_length": 10, + "average_sentence2_length": 50.79593848580441, + "max_sentence2_length": 311, + "unique_sentence2": 5072 + }, + "nor-por": { + "num_samples": 2096, + "number_of_characters": 227941, + "unique_pairs": 2096, + "min_sentence1_length": 11, + "average_sentence1_length": 51.42557251908397, + "max_sentence1_length": 200, + "unique_sentence1": 2096, + "min_sentence2_length": 12, + "average_sentence2_length": 57.32490458015267, + "max_sentence2_length": 180, + "unique_sentence2": 2096 + }, + "nor-ron": { + "num_samples": 1412, + "number_of_characters": 156087, + "unique_pairs": 1412, + "min_sentence1_length": 13, + "average_sentence1_length": 52.156515580736546, + "max_sentence1_length": 291, + "unique_sentence1": 1412, + "min_sentence2_length": 14, + "average_sentence2_length": 58.38668555240793, + "max_sentence2_length": 281, + "unique_sentence2": 1412 + }, + "nor-spa": { + "num_samples": 2603, + "number_of_characters": 283540, + "unique_pairs": 2603, + "min_sentence1_length": 12, + "average_sentence1_length": 50.998079139454475, + "max_sentence1_length": 283, + "unique_sentence1": 2603, + "min_sentence2_length": 13, + "average_sentence2_length": 57.930080676142914, + "max_sentence2_length": 274, + "unique_sentence2": 2603 + }, + "nor-swe": { + "num_samples": 3165, + "number_of_characters": 325198, + "unique_pairs": 3165, + "min_sentence1_length": 11, + "average_sentence1_length": 50.93870458135861, + "max_sentence1_length": 291, + "unique_sentence1": 3165, + "min_sentence2_length": 10, + "average_sentence2_length": 51.80947867298578, + "max_sentence2_length": 289, + "unique_sentence2": 3165 + }, + "por-ron": { + "num_samples": 3026, + "number_of_characters": 341012, + "unique_pairs": 3026, + "min_sentence1_length": 12, + "average_sentence1_length": 56.56840713813615, + "max_sentence1_length": 334, + "unique_sentence1": 3026, + "min_sentence2_length": 12, + "average_sentence2_length": 56.125578321216125, + "max_sentence2_length": 402, + "unique_sentence2": 3026 + }, + "por-spa": { + "num_samples": 16084, + "number_of_characters": 1698580, + "unique_pairs": 16084, + "min_sentence1_length": 10, + "average_sentence1_length": 51.925205172842574, + "max_sentence1_length": 350, + "unique_sentence1": 16084, + "min_sentence2_length": 10, + "average_sentence2_length": 53.681609052474506, + "max_sentence2_length": 364, + "unique_sentence2": 16084 + }, + "por-swe": { + "num_samples": 4235, + "number_of_characters": 444824, + "unique_pairs": 4235, + "min_sentence1_length": 11, + "average_sentence1_length": 54.483825265643446, + "max_sentence1_length": 282, + "unique_sentence1": 4235, + "min_sentence2_length": 11, + "average_sentence2_length": 50.55135773317591, + "max_sentence2_length": 255, + "unique_sentence2": 4235 + }, + "ron-spa": { + "num_samples": 3375, + "number_of_characters": 376497, + "unique_pairs": 3375, + "min_sentence1_length": 12, + "average_sentence1_length": 54.70755555555556, + "max_sentence1_length": 587, + "unique_sentence1": 3375, + "min_sentence2_length": 11, + "average_sentence2_length": 56.84711111111111, + "max_sentence2_length": 576, + "unique_sentence2": 3375 + }, + "ron-swe": { + "num_samples": 2154, + "number_of_characters": 238390, + "unique_pairs": 2154, + "min_sentence1_length": 9, + "average_sentence1_length": 57.035747446610955, + "max_sentence1_length": 281, + "unique_sentence1": 2154, + "min_sentence2_length": 8, + "average_sentence2_length": 53.637418755803154, + "max_sentence2_length": 289, + "unique_sentence2": 2154 + }, + "spa-swe": { + "num_samples": 4884, + "number_of_characters": 525651, + "unique_pairs": 4884, + "min_sentence1_length": 12, + "average_sentence1_length": 56.58456183456183, + "max_sentence1_length": 244, + "unique_sentence1": 4884, + "min_sentence2_length": 10, + "average_sentence2_length": 51.042588042588044, + "max_sentence2_length": 280, + "unique_sentence2": 4884 + }, + "ben-hin": { + "num_samples": 1174, + "number_of_characters": 115288, + "unique_pairs": 1174, + "min_sentence1_length": 11, + "average_sentence1_length": 47.49574105621806, + "max_sentence1_length": 145, + "unique_sentence1": 1174, + "min_sentence2_length": 13, + "average_sentence2_length": 50.70528109028961, + "max_sentence2_length": 149, + "unique_sentence2": 1174 + }, + "ben-mar": { + "num_samples": 566, + "number_of_characters": 54106, + "unique_pairs": 566, + "min_sentence1_length": 12, + "average_sentence1_length": 47.79858657243816, + "max_sentence1_length": 145, + "unique_sentence1": 566, + "min_sentence2_length": 14, + "average_sentence2_length": 47.79505300353357, + "max_sentence2_length": 130, + "unique_sentence2": 566 + }, + "ben-urd": { + "num_samples": 488, + "number_of_characters": 44387, + "unique_pairs": 488, + "min_sentence1_length": 11, + "average_sentence1_length": 44.0, + "max_sentence1_length": 110, + "unique_sentence1": 488, + "min_sentence2_length": 12, + "average_sentence2_length": 46.95696721311475, + "max_sentence2_length": 101, + "unique_sentence2": 488 + }, + "hin-mar": { + "num_samples": 615, + "number_of_characters": 59136, + "unique_pairs": 615, + "min_sentence1_length": 11, + "average_sentence1_length": 49.56585365853658, + "max_sentence1_length": 149, + "unique_sentence1": 615, + "min_sentence2_length": 11, + "average_sentence2_length": 46.59024390243903, + "max_sentence2_length": 143, + "unique_sentence2": 615 + }, + "hin-urd": { + "num_samples": 545, + "number_of_characters": 52165, + "unique_pairs": 545, + "min_sentence1_length": 14, + "average_sentence1_length": 48.37614678899082, + "max_sentence1_length": 111, + "unique_sentence1": 545, + "min_sentence2_length": 12, + "average_sentence2_length": 47.3394495412844, + "max_sentence2_length": 125, + "unique_sentence2": 545 + }, + "mar-urd": { + "num_samples": 270, + "number_of_characters": 23951, + "unique_pairs": 270, + "min_sentence1_length": 14, + "average_sentence1_length": 43.162962962962965, + "max_sentence1_length": 108, + "unique_sentence1": 270, + "min_sentence2_length": 13, + "average_sentence2_length": 45.544444444444444, + "max_sentence2_length": 107, + "unique_sentence2": 270 + }, + "aze-kaz": { + "num_samples": 412, + "number_of_characters": 38912, + "unique_pairs": 412, + "min_sentence1_length": 12, + "average_sentence1_length": 46.70145631067961, + "max_sentence1_length": 121, + "unique_sentence1": 412, + "min_sentence2_length": 14, + "average_sentence2_length": 47.74514563106796, + "max_sentence2_length": 108, + "unique_sentence2": 412 + }, + "aze-tur": { + "num_samples": 388, + "number_of_characters": 36138, + "unique_pairs": 388, + "min_sentence1_length": 12, + "average_sentence1_length": 46.69072164948454, + "max_sentence1_length": 124, + "unique_sentence1": 388, + "min_sentence2_length": 12, + "average_sentence2_length": 46.44845360824742, + "max_sentence2_length": 123, + "unique_sentence2": 388 + }, + "kaz-tur": { + "num_samples": 340, + "number_of_characters": 31637, + "unique_pairs": 340, + "min_sentence1_length": 17, + "average_sentence1_length": 47.26764705882353, + "max_sentence1_length": 122, + "unique_sentence1": 340, + "min_sentence2_length": 12, + "average_sentence2_length": 45.78235294117647, + "max_sentence2_length": 114, + "unique_sentence2": 340 + }, + "est-fin": { + "num_samples": 790, + "number_of_characters": 80226, + "unique_pairs": 790, + "min_sentence1_length": 13, + "average_sentence1_length": 50.346835443037975, + "max_sentence1_length": 158, + "unique_sentence1": 790, + "min_sentence2_length": 14, + "average_sentence2_length": 51.20506329113924, + "max_sentence2_length": 152, + "unique_sentence2": 790 + }, + "est-hun": { + "num_samples": 674, + "number_of_characters": 69641, + "unique_pairs": 674, + "min_sentence1_length": 8, + "average_sentence1_length": 50.49258160237389, + "max_sentence1_length": 157, + "unique_sentence1": 674, + "min_sentence2_length": 9, + "average_sentence2_length": 52.832344213649854, + "max_sentence2_length": 180, + "unique_sentence2": 674 + }, + "fin-hun": { + "num_samples": 1542, + "number_of_characters": 167588, + "unique_pairs": 1542, + "min_sentence1_length": 8, + "average_sentence1_length": 53.54928664072633, + "max_sentence1_length": 243, + "unique_sentence1": 1542, + "min_sentence2_length": 9, + "average_sentence2_length": 55.132944228274965, + "max_sentence2_length": 228, + "unique_sentence2": 1542 + }, + "ara-eng": { + "num_samples": 5698, + "number_of_characters": 544132, + "unique_pairs": 5698, + "min_sentence1_length": 10, + "average_sentence1_length": 45.672165672165676, + "max_sentence1_length": 280, + "unique_sentence1": 5698, + "min_sentence2_length": 11, + "average_sentence2_length": 49.82309582309583, + "max_sentence2_length": 287, + "unique_sentence2": 5698 + }, + "aze-eng": { + "num_samples": 603, + "number_of_characters": 58907, + "unique_pairs": 603, + "min_sentence1_length": 12, + "average_sentence1_length": 49.94195688225539, + "max_sentence1_length": 121, + "unique_sentence1": 603, + "min_sentence2_length": 12, + "average_sentence2_length": 47.74792703150912, + "max_sentence2_length": 129, + "unique_sentence2": 603 + }, + "ben-eng": { + "num_samples": 1367, + "number_of_characters": 126399, + "unique_pairs": 1367, + "min_sentence1_length": 10, + "average_sentence1_length": 46.61155815654718, + "max_sentence1_length": 147, + "unique_sentence1": 1367, + "min_sentence2_length": 14, + "average_sentence2_length": 45.85296269202634, + "max_sentence2_length": 148, + "unique_sentence2": 1367 + }, + "bul-eng": { + "num_samples": 2133, + "number_of_characters": 219893, + "unique_pairs": 2133, + "min_sentence1_length": 12, + "average_sentence1_length": 53.70604781997187, + "max_sentence1_length": 273, + "unique_sentence1": 2133, + "min_sentence2_length": 11, + "average_sentence2_length": 49.384903891233, + "max_sentence2_length": 287, + "unique_sentence2": 2133 + }, + "cat-eng": { + "num_samples": 1152, + "number_of_characters": 118852, + "unique_pairs": 1152, + "min_sentence1_length": 10, + "average_sentence1_length": 52.33940972222222, + "max_sentence1_length": 202, + "unique_sentence1": 1152, + "min_sentence2_length": 11, + "average_sentence2_length": 50.830729166666664, + "max_sentence2_length": 195, + "unique_sentence2": 1152 + }, + "ces-eng": { + "num_samples": 3775, + "number_of_characters": 364386, + "unique_pairs": 3775, + "min_sentence1_length": 10, + "average_sentence1_length": 47.79019867549669, + "max_sentence1_length": 265, + "unique_sentence1": 3775, + "min_sentence2_length": 10, + "average_sentence2_length": 48.7358940397351, + "max_sentence2_length": 287, + "unique_sentence2": 3775 + }, + "dan-eng": { + "num_samples": 4512, + "number_of_characters": 451232, + "unique_pairs": 4512, + "min_sentence1_length": 10, + "average_sentence1_length": 51.150930851063826, + "max_sentence1_length": 296, + "unique_sentence1": 4512, + "min_sentence2_length": 10, + "average_sentence2_length": 48.856161347517734, + "max_sentence2_length": 287, + "unique_sentence2": 4512 + }, + "deu-eng": { + "num_samples": 37348, + "number_of_characters": 3890899, + "unique_pairs": 37348, + "min_sentence1_length": 8, + "average_sentence1_length": 55.154519652993464, + "max_sentence1_length": 716, + "unique_sentence1": 37348, + "min_sentence2_length": 9, + "average_sentence2_length": 49.025061582949554, + "max_sentence2_length": 586, + "unique_sentence2": 37348 + }, + "ell-eng": { + "num_samples": 2790, + "number_of_characters": 302459, + "unique_pairs": 2790, + "min_sentence1_length": 14, + "average_sentence1_length": 58.78207885304659, + "max_sentence1_length": 286, + "unique_sentence1": 2790, + "min_sentence2_length": 11, + "average_sentence2_length": 49.62616487455197, + "max_sentence2_length": 243, + "unique_sentence2": 2790 + }, + "eng-est": { + "num_samples": 755, + "number_of_characters": 75289, + "unique_pairs": 755, + "min_sentence1_length": 12, + "average_sentence1_length": 49.10331125827815, + "max_sentence1_length": 152, + "unique_sentence1": 755, + "min_sentence2_length": 15, + "average_sentence2_length": 50.617218543046356, + "max_sentence2_length": 160, + "unique_sentence2": 755 + }, + "eng-fas": { + "num_samples": 556, + "number_of_characters": 52628, + "unique_pairs": 556, + "min_sentence1_length": 12, + "average_sentence1_length": 48.160071942446045, + "max_sentence1_length": 184, + "unique_sentence1": 556, + "min_sentence2_length": 10, + "average_sentence2_length": 46.49460431654676, + "max_sentence2_length": 161, + "unique_sentence2": 556 + }, + "eng-fin": { + "num_samples": 3443, + "number_of_characters": 348436, + "unique_pairs": 3443, + "min_sentence1_length": 11, + "average_sentence1_length": 49.67876851582922, + "max_sentence1_length": 410, + "unique_sentence1": 3443, + "min_sentence2_length": 11, + "average_sentence2_length": 51.522509439442345, + "max_sentence2_length": 387, + "unique_sentence2": 3443 + }, + "eng-fra": { + "num_samples": 37208, + "number_of_characters": 4091513, + "unique_pairs": 37208, + "min_sentence1_length": 8, + "average_sentence1_length": 49.105138679853795, + "max_sentence1_length": 414, + "unique_sentence1": 37208, + "min_sentence2_length": 10, + "average_sentence2_length": 60.85812190926683, + "max_sentence2_length": 428, + "unique_sentence2": 37208 + }, + "eng-heb": { + "num_samples": 882, + "number_of_characters": 88397, + "unique_pairs": 882, + "min_sentence1_length": 13, + "average_sentence1_length": 54.84126984126984, + "max_sentence1_length": 193, + "unique_sentence1": 882, + "min_sentence2_length": 10, + "average_sentence2_length": 45.38208616780045, + "max_sentence2_length": 162, + "unique_sentence2": 882 + }, + "eng-hin": { + "num_samples": 2219, + "number_of_characters": 227451, + "unique_pairs": 2219, + "min_sentence1_length": 12, + "average_sentence1_length": 49.35466426318161, + "max_sentence1_length": 222, + "unique_sentence1": 2219, + "min_sentence2_length": 12, + "average_sentence2_length": 53.146913023884636, + "max_sentence2_length": 281, + "unique_sentence2": 2219 + }, + "eng-hrv": { + "num_samples": 336, + "number_of_characters": 35675, + "unique_pairs": 336, + "min_sentence1_length": 11, + "average_sentence1_length": 52.61904761904762, + "max_sentence1_length": 264, + "unique_sentence1": 336, + "min_sentence2_length": 16, + "average_sentence2_length": 53.55654761904762, + "max_sentence2_length": 258, + "unique_sentence2": 336 + }, + "eng-hun": { + "num_samples": 2185, + "number_of_characters": 225323, + "unique_pairs": 2185, + "min_sentence1_length": 8, + "average_sentence1_length": 50.013272311212816, + "max_sentence1_length": 238, + "unique_sentence1": 2185, + "min_sentence2_length": 9, + "average_sentence2_length": 53.10938215102975, + "max_sentence2_length": 233, + "unique_sentence2": 2185 + }, + "eng-ind": { + "num_samples": 3454, + "number_of_characters": 364799, + "unique_pairs": 3454, + "min_sentence1_length": 11, + "average_sentence1_length": 49.18413433700058, + "max_sentence1_length": 222, + "unique_sentence1": 3454, + "min_sentence2_length": 11, + "average_sentence2_length": 56.43225246091488, + "max_sentence2_length": 245, + "unique_sentence2": 3454 + }, + "eng-isl": { + "num_samples": 358, + "number_of_characters": 33431, + "unique_pairs": 358, + "min_sentence1_length": 13, + "average_sentence1_length": 46.72346368715084, + "max_sentence1_length": 136, + "unique_sentence1": 358, + "min_sentence2_length": 13, + "average_sentence2_length": 46.659217877094974, + "max_sentence2_length": 122, + "unique_sentence2": 358 + }, + "eng-ita": { + "num_samples": 19661, + "number_of_characters": 2063797, + "unique_pairs": 19661, + "min_sentence1_length": 10, + "average_sentence1_length": 49.52357458928844, + "max_sentence1_length": 365, + "unique_sentence1": 19661, + "min_sentence2_length": 10, + "average_sentence2_length": 55.445501246121765, + "max_sentence2_length": 362, + "unique_sentence2": 19661 + }, + "eng-jpn": { + "num_samples": 3807, + "number_of_characters": 318641, + "unique_pairs": 3807, + "min_sentence1_length": 12, + "average_sentence1_length": 51.44890990281061, + "max_sentence1_length": 743, + "unique_sentence1": 3807, + "min_sentence2_length": 10, + "average_sentence2_length": 32.249802994483844, + "max_sentence2_length": 414, + "unique_sentence2": 3807 + }, + "eng-kaz": { + "num_samples": 346, + "number_of_characters": 32798, + "unique_pairs": 346, + "min_sentence1_length": 15, + "average_sentence1_length": 45.540462427745666, + "max_sentence1_length": 110, + "unique_sentence1": 346, + "min_sentence2_length": 18, + "average_sentence2_length": 49.2514450867052, + "max_sentence2_length": 122, + "unique_sentence2": 346 + }, + "eng-kor": { + "num_samples": 2558, + "number_of_characters": 217654, + "unique_pairs": 2558, + "min_sentence1_length": 12, + "average_sentence1_length": 51.77990617670055, + "max_sentence1_length": 252, + "unique_sentence1": 2558, + "min_sentence2_length": 10, + "average_sentence2_length": 33.30766223612197, + "max_sentence2_length": 225, + "unique_sentence2": 2558 + }, + "eng-lav": { + "num_samples": 1079, + "number_of_characters": 103672, + "unique_pairs": 1079, + "min_sentence1_length": 12, + "average_sentence1_length": 47.47265987025023, + "max_sentence1_length": 165, + "unique_sentence1": 1079, + "min_sentence2_length": 11, + "average_sentence2_length": 48.60889712696942, + "max_sentence2_length": 151, + "unique_sentence2": 1079 + }, + "eng-lit": { + "num_samples": 1185, + "number_of_characters": 113428, + "unique_pairs": 1185, + "min_sentence1_length": 12, + "average_sentence1_length": 47.12236286919831, + "max_sentence1_length": 175, + "unique_sentence1": 1185, + "min_sentence2_length": 12, + "average_sentence2_length": 48.59746835443038, + "max_sentence2_length": 167, + "unique_sentence2": 1185 + }, + "eng-mar": { + "num_samples": 280, + "number_of_characters": 26641, + "unique_pairs": 280, + "min_sentence1_length": 14, + "average_sentence1_length": 47.65714285714286, + "max_sentence1_length": 148, + "unique_sentence1": 280, + "min_sentence2_length": 14, + "average_sentence2_length": 47.489285714285714, + "max_sentence2_length": 143, + "unique_sentence2": 280 + }, + "eng-msa": { + "num_samples": 469, + "number_of_characters": 59862, + "unique_pairs": 469, + "min_sentence1_length": 15, + "average_sentence1_length": 61.27078891257996, + "max_sentence1_length": 163, + "unique_sentence1": 469, + "min_sentence2_length": 14, + "average_sentence2_length": 66.36673773987206, + "max_sentence2_length": 170, + "unique_sentence2": 469 + }, + "eng-nld": { + "num_samples": 15613, + "number_of_characters": 1555910, + "unique_pairs": 15613, + "min_sentence1_length": 10, + "average_sentence1_length": 47.859860372766285, + "max_sentence1_length": 312, + "unique_sentence1": 15613, + "min_sentence2_length": 10, + "average_sentence2_length": 51.79491449433165, + "max_sentence2_length": 337, + "unique_sentence2": 15613 + }, + "eng-nor": { + "num_samples": 2666, + "number_of_characters": 263032, + "unique_pairs": 2666, + "min_sentence1_length": 11, + "average_sentence1_length": 49.035258814703674, + "max_sentence1_length": 287, + "unique_sentence1": 2666, + "min_sentence2_length": 11, + "average_sentence2_length": 49.626406601650416, + "max_sentence2_length": 291, + "unique_sentence2": 2666 + }, + "eng-pol": { + "num_samples": 6868, + "number_of_characters": 682796, + "unique_pairs": 6868, + "min_sentence1_length": 8, + "average_sentence1_length": 47.905503785672686, + "max_sentence1_length": 313, + "unique_sentence1": 6868, + "min_sentence2_length": 9, + "average_sentence2_length": 51.51150262085032, + "max_sentence2_length": 325, + "unique_sentence2": 6868 + }, + "eng-por": { + "num_samples": 12406, + "number_of_characters": 1237980, + "unique_pairs": 12406, + "min_sentence1_length": 10, + "average_sentence1_length": 47.422376269546994, + "max_sentence1_length": 743, + "unique_sentence1": 12406, + "min_sentence2_length": 10, + "average_sentence2_length": 52.36643559567951, + "max_sentence2_length": 761, + "unique_sentence2": 12406 + }, + "eng-ron": { + "num_samples": 3039, + "number_of_characters": 322632, + "unique_pairs": 3039, + "min_sentence1_length": 8, + "average_sentence1_length": 50.611714379730174, + "max_sentence1_length": 586, + "unique_sentence1": 3039, + "min_sentence2_length": 9, + "average_sentence2_length": 55.55215531424811, + "max_sentence2_length": 699, + "unique_sentence2": 3039 + }, + "eng-rus": { + "num_samples": 9360, + "number_of_characters": 983153, + "unique_pairs": 9360, + "min_sentence1_length": 10, + "average_sentence1_length": 51.38547008547008, + "max_sentence1_length": 607, + "unique_sentence1": 9360, + "min_sentence2_length": 10, + "average_sentence2_length": 53.65224358974359, + "max_sentence2_length": 773, + "unique_sentence2": 9360 + }, + "eng-slk": { + "num_samples": 1823, + "number_of_characters": 178649, + "unique_pairs": 1823, + "min_sentence1_length": 12, + "average_sentence1_length": 49.05979155238618, + "max_sentence1_length": 637, + "unique_sentence1": 1823, + "min_sentence2_length": 10, + "average_sentence2_length": 48.93746571585299, + "max_sentence2_length": 597, + "unique_sentence2": 1823 + }, + "eng-slv": { + "num_samples": 1450, + "number_of_characters": 151964, + "unique_pairs": 1450, + "min_sentence1_length": 12, + "average_sentence1_length": 51.282758620689656, + "max_sentence1_length": 287, + "unique_sentence1": 1450, + "min_sentence2_length": 11, + "average_sentence2_length": 53.52, + "max_sentence2_length": 252, + "unique_sentence2": 1450 + }, + "eng-spa": { + "num_samples": 35446, + "number_of_characters": 3679410, + "unique_pairs": 35446, + "min_sentence1_length": 10, + "average_sentence1_length": 48.672374880099305, + "max_sentence1_length": 525, + "unique_sentence1": 35446, + "min_sentence2_length": 10, + "average_sentence2_length": 55.13087513400666, + "max_sentence2_length": 607, + "unique_sentence2": 35446 + }, + "eng-srp": { + "num_samples": 303, + "number_of_characters": 29733, + "unique_pairs": 303, + "min_sentence1_length": 15, + "average_sentence1_length": 48.15181518151815, + "max_sentence1_length": 123, + "unique_sentence1": 303, + "min_sentence2_length": 10, + "average_sentence2_length": 49.976897689768975, + "max_sentence2_length": 128, + "unique_sentence2": 303 + }, + "eng-swe": { + "num_samples": 6005, + "number_of_characters": 597122, + "unique_pairs": 6005, + "min_sentence1_length": 10, + "average_sentence1_length": 49.21248959200666, + "max_sentence1_length": 287, + "unique_sentence1": 6005, + "min_sentence2_length": 10, + "average_sentence2_length": 50.22497918401332, + "max_sentence2_length": 289, + "unique_sentence2": 6005 + }, + "eng-tgl": { + "num_samples": 551, + "number_of_characters": 56336, + "unique_pairs": 551, + "min_sentence1_length": 14, + "average_sentence1_length": 45.60798548094374, + "max_sentence1_length": 165, + "unique_sentence1": 551, + "min_sentence2_length": 13, + "average_sentence2_length": 56.635208711433755, + "max_sentence2_length": 198, + "unique_sentence2": 551 + }, + "eng-tha": { + "num_samples": 814, + "number_of_characters": 79610, + "unique_pairs": 814, + "min_sentence1_length": 11, + "average_sentence1_length": 50.45331695331695, + "max_sentence1_length": 544, + "unique_sentence1": 814, + "min_sentence2_length": 11, + "average_sentence2_length": 47.347665847665844, + "max_sentence2_length": 511, + "unique_sentence2": 814 + }, + "eng-tur": { + "num_samples": 4606, + "number_of_characters": 446492, + "unique_pairs": 4606, + "min_sentence1_length": 10, + "average_sentence1_length": 47.3936170212766, + "max_sentence1_length": 287, + "unique_sentence1": 4606, + "min_sentence2_length": 10, + "average_sentence2_length": 49.54342162396873, + "max_sentence2_length": 314, + "unique_sentence2": 4606 + }, + "eng-ukr": { + "num_samples": 3778, + "number_of_characters": 388398, + "unique_pairs": 3778, + "min_sentence1_length": 11, + "average_sentence1_length": 51.143197458973, + "max_sentence1_length": 284, + "unique_sentence1": 3778, + "min_sentence2_length": 10, + "average_sentence2_length": 51.661990471148755, + "max_sentence2_length": 255, + "unique_sentence2": 3778 + }, + "eng-urd": { + "num_samples": 268, + "number_of_characters": 25182, + "unique_pairs": 268, + "min_sentence1_length": 14, + "average_sentence1_length": 46.1044776119403, + "max_sentence1_length": 99, + "unique_sentence1": 268, + "min_sentence2_length": 15, + "average_sentence2_length": 47.85820895522388, + "max_sentence2_length": 107, + "unique_sentence2": 268 + }, + "eng-vie": { + "num_samples": 1264, + "number_of_characters": 123022, + "unique_pairs": 1264, + "min_sentence1_length": 12, + "average_sentence1_length": 45.6503164556962, + "max_sentence1_length": 241, + "unique_sentence1": 1264, + "min_sentence2_length": 10, + "average_sentence2_length": 51.677215189873415, + "max_sentence2_length": 262, + "unique_sentence2": 1264 + }, + "eng-zho": { + "num_samples": 4959, + "number_of_characters": 347349, + "unique_pairs": 4959, + "min_sentence1_length": 11, + "average_sentence1_length": 50.00564629965719, + "max_sentence1_length": 847, + "unique_sentence1": 4959, + "min_sentence2_length": 8, + "average_sentence2_length": 20.038515829804396, + "max_sentence2_length": 620, + "unique_sentence2": 4959 + } + } + } +} \ No newline at end of file diff --git a/mteb/tasks/BitextMining/__init__.py b/mteb/tasks/BitextMining/__init__.py index 9236fdb972..79c2e706a4 100644 --- a/mteb/tasks/BitextMining/__init__.py +++ b/mteb/tasks/BitextMining/__init__.py @@ -22,5 +22,6 @@ from .multilingual.PhincBitextMining import * from .multilingual.RomaTalesBitextMining import * from .multilingual.TatoebaBitextMining import * +from .multilingual.WebFAQBitextMining import * from .srn.SRNCorpusBitextMining import * from .vie.VieMedEVBitextMining import * diff --git a/mteb/tasks/BitextMining/multilingual/WebFAQBitextMining.py b/mteb/tasks/BitextMining/multilingual/WebFAQBitextMining.py new file mode 100644 index 0000000000..c0561e01ab --- /dev/null +++ b/mteb/tasks/BitextMining/multilingual/WebFAQBitextMining.py @@ -0,0 +1,306 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskBitextMining import AbsTaskBitextMining +from mteb.abstasks.MultilingualTask import MultilingualTask +from mteb.abstasks.TaskMetadata import TaskMetadata + +# Consider only those language pairs with at least 250 samples +_LANGUAGES = { + # Afro-Asiatic, Indo-European (Iranian) + "ara-fas": ["ara-Arab", "fas-Arab"], # Samples: 609 + "ara-heb": ["ara-Arab", "heb-Hebr"], # Samples: 978 + # + # Austroasiatic, Japonic, Koreanic, Sino-Tibetan + "jpn-kor": ["jpn-Jpan", "kor-Kore"], # Samples: 4,820 + "jpn-vie": ["jpn-Jpan", "vie-Latn"], # Samples: 1,356 + "jpn-zho": ["jpn-Jpan", "zho-Hans"], # Samples: 1,728 + "kor-vie": ["kor-Kore", "vie-Latn"], # Samples: 1,386 + "kor-zho": ["kor-Kore", "zho-Hans"], # Samples: 1,087 + "vie-zho": ["vie-Latn", "zho-Hans"], # Samples: 646 + # + # Austronesian, Kra-Dai + "ind-msa": ["ind-Latn", "msa-Latn"], # Samples: 455 + "ind-tgl": ["ind-Latn", "tgl-Latn"], # Samples: 378 + "ind-tha": ["ind-Latn", "tha-Thai"], # Samples: 1,258 + # + # Caucasian, Indo-European (Baltic), Indo-European (Slavic) + "bul-ces": ["bul-Cyrl", "ces-Latn"], # Samples: 1,485 + "bul-lav": ["bul-Cyrl", "lav-Latn"], # Samples: 710 + "bul-lit": ["bul-Cyrl", "lit-Latn"], # Samples: 803 + "bul-pol": ["bul-Cyrl", "pol-Latn"], # Samples: 1,635 + "bul-rus": ["bul-Cyrl", "rus-Cyrl"], # Samples: 1,476 + "bul-slk": ["bul-Cyrl", "slk-Latn"], # Samples: 1,154 + "bul-slv": ["bul-Cyrl", "slv-Latn"], # Samples: 1,034 + "bul-srp": ["bul-Cyrl", "srp-Cyrl"], # Samples: 296 + "bul-ukr": ["bul-Cyrl", "ukr-Cyrl"], # Samples: 1,074 + "ces-lav": ["ces-Latn", "lav-Latn"], # Samples: 875 + "ces-lit": ["ces-Latn", "lit-Latn"], # Samples: 1,002 + "ces-pol": ["ces-Latn", "pol-Latn"], # Samples: 3,367 + "ces-rus": ["ces-Latn", "rus-Cyrl"], # Samples: 2,144 + "ces-slk": ["ces-Latn", "slk-Latn"], # Samples: 2,551 + "ces-slv": ["ces-Latn", "slv-Latn"], # Samples: 1,370 + "ces-srp": ["ces-Latn", "srp-Cyrl"], # Samples: 362 + "ces-ukr": ["ces-Latn", "ukr-Cyrl"], # Samples: 1,285 + "hrv-slk": ["hrv-Latn", "slk-Latn"], # Samples: 313 + "kat-rus": ["kat-Geor", "rus-Cyrl"], # Samples: 262 + "lav-lit": ["lav-Latn", "lit-Latn"], # Samples: 1,061 + "lav-pol": ["lav-Latn", "pol-Latn"], # Samples: 951 + "lav-rus": ["lav-Latn", "rus-Cyrl"], # Samples: 1,412 + "lav-slk": ["lav-Latn", "slk-Latn"], # Samples: 789 + "lav-slv": ["lav-Latn", "slv-Latn"], # Samples: 518 + "lav-ukr": ["lav-Latn", "ukr-Cyrl"], # Samples: 579 + "lit-pol": ["lit-Latn", "pol-Latn"], # Samples: 1,026 + "lit-rus": ["lit-Latn", "rus-Cyrl"], # Samples: 961 + "lit-slk": ["lit-Latn", "slk-Latn"], # Samples: 859 + "lit-slv": ["lit-Latn", "slv-Latn"], # Samples: 607 + "lit-ukr": ["lit-Latn", "ukr-Cyrl"], # Samples: 639 + "pol-rus": ["pol-Latn", "rus-Cyrl"], # Samples: 5,014 + "pol-slk": ["pol-Latn", "slk-Latn"], # Samples: 1,918 + "pol-slv": ["pol-Latn", "slv-Latn"], # Samples: 1,382 + "pol-srp": ["pol-Latn", "srp-Cyrl"], # Samples: 492 + "pol-ukr": ["pol-Latn", "ukr-Cyrl"], # Samples: 2,370 + "rus-slk": ["rus-Cyrl", "slk-Latn"], # Samples: 1,263 + "rus-slv": ["rus-Cyrl", "slv-Latn"], # Samples: 1,096 + "rus-srp": ["rus-Cyrl", "srp-Cyrl"], # Samples: 455 + "rus-ukr": ["rus-Cyrl", "ukr-Cyrl"], # Samples: 15,251 + "slk-slv": ["slk-Latn", "slv-Latn"], # Samples: 1,259 + "slk-srp": ["slk-Latn", "srp-Cyrl"], # Samples: 561 + "slk-ukr": ["slk-Latn", "ukr-Cyrl"], # Samples: 944 + "slv-srp": ["slv-Latn", "srp-Cyrl"], # Samples: 499 + "slv-ukr": ["slv-Latn", "ukr-Cyrl"], # Samples: 733 + # + # Indo-European (Germanic), Indo-European (Romance) + "cat-deu": ["cat-Latn", "deu-Latn"], # Samples: 302 + "cat-fra": ["cat-Latn", "fra-Latn"], # Samples: 598 + "cat-ita": ["cat-Latn", "ita-Latn"], # Samples: 418 + "cat-por": ["cat-Latn", "por-Latn"], # Samples: 370 + "cat-spa": ["cat-Latn", "spa-Latn"], # Samples: 2,648 + "dan-deu": ["dan-Latn", "deu-Latn"], # Samples: 4,337 + "dan-fra": ["dan-Latn", "fra-Latn"], # Samples: 3,802 + "dan-isl": ["dan-Latn", "isl-Latn"], # Samples: 327 + "dan-ita": ["dan-Latn", "ita-Latn"], # Samples: 3,818 + "dan-nld": ["dan-Latn", "nld-Latn"], # Samples: 4,099 + "dan-nor": ["dan-Latn", "nor-Latn"], # Samples: 2,603 + "dan-por": ["dan-Latn", "por-Latn"], # Samples: 3,206 + "dan-ron": ["dan-Latn", "ron-Latn"], # Samples: 2,052 + "dan-spa": ["dan-Latn", "spa-Latn"], # Samples: 3,571 + "dan-swe": ["dan-Latn", "swe-Latn"], # Samples: 4,268 + "deu-fra": ["deu-Latn", "fra-Latn"], # Samples: 27,727 + "deu-isl": ["deu-Latn", "isl-Latn"], # Samples: 294 + "deu-ita": ["deu-Latn", "ita-Latn"], # Samples: 18,787 + "deu-nld": ["deu-Latn", "nld-Latn"], # Samples: 14,211 + "deu-nor": ["deu-Latn", "nor-Latn"], # Samples: 2,783 + "deu-por": ["deu-Latn", "por-Latn"], # Samples: 11,319 + "deu-ron": ["deu-Latn", "ron-Latn"], # Samples: 3,598 + "deu-spa": ["deu-Latn", "spa-Latn"], # Samples: 19,739 + "deu-swe": ["deu-Latn", "swe-Latn"], # Samples: 5,772 + "fra-isl": ["fra-Latn", "isl-Latn"], # Samples: 347 + "fra-ita": ["fra-Latn", "ita-Latn"], # Samples: 20,002 + "fra-nld": ["fra-Latn", "nld-Latn"], # Samples: 14,684 + "fra-nor": ["fra-Latn", "nor-Latn"], # Samples: 2,558 + "fra-por": ["fra-Latn", "por-Latn"], # Samples: 13,265 + "fra-ron": ["fra-Latn", "ron-Latn"], # Samples: 3,295 + "fra-spa": ["fra-Latn", "spa-Latn"], # Samples: 23,311 + "fra-swe": ["fra-Latn", "swe-Latn"], # Samples: 5,006 + "isl-ita": ["isl-Latn", "ita-Latn"], # Samples: 421 + "isl-nld": ["isl-Latn", "nld-Latn"], # Samples: 311 + "isl-por": ["isl-Latn", "por-Latn"], # Samples: 341 + "isl-spa": ["isl-Latn", "spa-Latn"], # Samples: 366 + "isl-swe": ["isl-Latn", "swe-Latn"], # Samples: 312 + "ita-nld": ["ita-Latn", "nld-Latn"], # Samples: 9,160 + "ita-nor": ["ita-Latn", "nor-Latn"], # Samples: 2,516 + "ita-por": ["ita-Latn", "por-Latn"], # Samples: 10,924 + "ita-ron": ["ita-Latn", "ron-Latn"], # Samples: 3,360 + "ita-spa": ["ita-Latn", "spa-Latn"], # Samples: 16,534 + "ita-swe": ["ita-Latn", "swe-Latn"], # Samples: 4,741 + "nld-nor": ["nld-Latn", "nor-Latn"], # Samples: 2,664 + "nld-por": ["nld-Latn", "por-Latn"], # Samples: 7,021 + "nld-ron": ["nld-Latn", "ron-Latn"], # Samples: 2,888 + "nld-spa": ["nld-Latn", "spa-Latn"], # Samples: 9,555 + "nld-swe": ["nld-Latn", "swe-Latn"], # Samples: 5,072 + "nor-por": ["nor-Latn", "por-Latn"], # Samples: 2,096 + "nor-ron": ["nor-Latn", "ron-Latn"], # Samples: 1,412 + "nor-spa": ["nor-Latn", "spa-Latn"], # Samples: 2,603 + "nor-swe": ["nor-Latn", "swe-Latn"], # Samples: 3,165 + "por-ron": ["por-Latn", "ron-Latn"], # Samples: 3,026 + "por-spa": ["por-Latn", "spa-Latn"], # Samples: 16,084 + "por-swe": ["por-Latn", "swe-Latn"], # Samples: 4,235 + "ron-spa": ["ron-Latn", "spa-Latn"], # Samples: 3,375 + "ron-swe": ["ron-Latn", "swe-Latn"], # Samples: 2,154 + "spa-swe": ["spa-Latn", "swe-Latn"], # Samples: 4,884 + # + # Indo-European (Indo-Aryan) + "ben-hin": ["ben-Beng", "hin-Deva"], # Samples: 1,174 + "ben-mar": ["ben-Beng", "mar-Deva"], # Samples: 566 + "ben-urd": ["ben-Beng", "urd-Arab"], # Samples: 488 + "hin-mar": ["hin-Deva", "mar-Deva"], # Samples: 615 + "hin-urd": ["hin-Deva", "urd-Arab"], # Samples: 545 + "mar-urd": ["mar-Deva", "urd-Arab"], # Samples: 270 + # + # Turkic + "aze-kaz": ["aze-Latn", "kaz-Cyrl"], # Samples: 412 + "aze-tur": ["aze-Latn", "tur-Latn"], # Samples: 388 + "kaz-tur": ["kaz-Cyrl", "tur-Latn"], # Samples: 340 + # + # Uralic + "est-fin": ["est-Latn", "fin-Latn"], # Samples: 790 + "est-hun": ["est-Latn", "hun-Latn"], # Samples: 674 + "fin-hun": ["fin-Latn", "hun-Latn"], # Samples: 1,542 + # + # Any2English + "ara-eng": ["ara-Arab", "eng-Latn"], # Samples: 5,698 + "aze-eng": ["aze-Latn", "eng-Latn"], # Samples: 603 + "ben-eng": ["ben-Beng", "eng-Latn"], # Samples: 1,367 + "bul-eng": ["bul-Cyrl", "eng-Latn"], # Samples: 2,133 + "cat-eng": ["cat-Latn", "eng-Latn"], # Samples: 1,152 + "ces-eng": ["ces-Latn", "eng-Latn"], # Samples: 3,775 + "dan-eng": ["dan-Latn", "eng-Latn"], # Samples: 4,512 + "deu-eng": ["deu-Latn", "eng-Latn"], # Samples: 37,348 + "ell-eng": ["ell-Grek", "eng-Latn"], # Samples: 2,790 + "eng-est": ["eng-Latn", "est-Latn"], # Samples: 755 + "eng-fas": ["eng-Latn", "fas-Arab"], # Samples: 556 + "eng-fin": ["eng-Latn", "fin-Latn"], # Samples: 3,443 + "eng-fra": ["eng-Latn", "fra-Latn"], # Samples: 37,208 + "eng-heb": ["eng-Latn", "heb-Hebr"], # Samples: 882 + "eng-hin": ["eng-Latn", "hin-Deva"], # Samples: 2,219 + "eng-hrv": ["eng-Latn", "hrv-Latn"], # Samples: 336 + "eng-hun": ["eng-Latn", "hun-Latn"], # Samples: 2,185 + "eng-ind": ["eng-Latn", "ind-Latn"], # Samples: 3,454 + "eng-isl": ["eng-Latn", "isl-Latn"], # Samples: 358 + "eng-ita": ["eng-Latn", "ita-Latn"], # Samples: 19,661 + "eng-jpn": ["eng-Latn", "jpn-Jpan"], # Samples: 3,807 + "eng-kaz": ["eng-Latn", "kaz-Cyrl"], # Samples: 346 + "eng-kor": ["eng-Latn", "kor-Kore"], # Samples: 2,558 + "eng-lav": ["eng-Latn", "lav-Latn"], # Samples: 1,079 + "eng-lit": ["eng-Latn", "lit-Latn"], # Samples: 1,185 + "eng-mar": ["eng-Latn", "mar-Deva"], # Samples: 280 + "eng-msa": ["eng-Latn", "msa-Latn"], # Samples: 469 + "eng-nld": ["eng-Latn", "nld-Latn"], # Samples: 15,613 + "eng-nor": ["eng-Latn", "nor-Latn"], # Samples: 2,666 + "eng-pol": ["eng-Latn", "pol-Latn"], # Samples: 6,868 + "eng-por": ["eng-Latn", "por-Latn"], # Samples: 12,406 + "eng-ron": ["eng-Latn", "ron-Latn"], # Samples: 3,039 + "eng-rus": ["eng-Latn", "rus-Cyrl"], # Samples: 9,360 + "eng-slk": ["eng-Latn", "slk-Latn"], # Samples: 1,823 + "eng-slv": ["eng-Latn", "slv-Latn"], # Samples: 1,450 + "eng-spa": ["eng-Latn", "spa-Latn"], # Samples: 35,446 + "eng-srp": ["eng-Latn", "srp-Cyrl"], # Samples: 303 + "eng-swe": ["eng-Latn", "swe-Latn"], # Samples: 6,005 + "eng-tgl": ["eng-Latn", "tgl-Latn"], # Samples: 551 + "eng-tha": ["eng-Latn", "tha-Thai"], # Samples: 814 + "eng-tur": ["eng-Latn", "tur-Latn"], # Samples: 4,606 + "eng-ukr": ["eng-Latn", "ukr-Cyrl"], # Samples: 3,778 + "eng-urd": ["eng-Latn", "urd-Arab"], # Samples: 268 + "eng-vie": ["eng-Latn", "vie-Latn"], # Samples: 1,264 + "eng-zho": ["eng-Latn", "zho-Hans"], # Samples: 4,959 +} + +_SPLITS = ["default"] + + +class WebFAQBitextMiningQuestions(AbsTaskBitextMining, MultilingualTask): + metadata = TaskMetadata( + name="WebFAQBitextMiningQuestions", + description="""The WebFAQ Bitext Dataset consists of natural FAQ-style Question-Answer pairs that align across languages. +A sentence in the "WebFAQBitextMiningQuestions" task is the question originating from an aligned QA. +The dataset is sourced from FAQ pages on the web.""", + reference="https://huggingface.co/PaDaS-Lab", + dataset={ + "path": "PaDaS-Lab/webfaq-bitexts", + "revision": "a1bc0e8fd36c3d5015bd64c14ca098596774784a", + }, + type="BitextMining", + category="s2s", + modalities=["text"], + eval_splits=_SPLITS, + eval_langs=_LANGUAGES, + main_score="f1", + date=("2022-09-01", "2024-10-01"), + domains=["Web", "Written"], + task_subtypes=[], + license="cc-by-4.0", + annotations_creators="human-annotated", + dialect=[], + sample_creation="human-translated", + bibtex_citation="""@misc{dinzinger2025webfaq, + title={WebFAQ: A Multilingual Collection of Natural Q&A Datasets for Dense Retrieval}, + author={Michael Dinzinger and Laura Caspari and Kanishka Ghosh Dastidar and Jelena Mitrović and Michael Granitzer}, + year={2025}, + eprint={2502.20936}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2502.20936}, +}""", + ) + + def dataset_transform(self): + dataset = {} + for langs in self.dataset: + dataset[langs] = {} + for split in _SPLITS: + sentence1 = [] + sentence2 = [] + for document in self.dataset[langs][split]: + sentence1.append(document["question1"]) + sentence2.append(document["question2"]) + + dataset[langs][split] = { + "sentence1": sentence1, + "sentence2": sentence2, + "gold": [(i, i) for i in range(len(sentence1))], + } + self.dataset = dataset + + +class WebFAQBitextMiningQAs(AbsTaskBitextMining, MultilingualTask): + metadata = TaskMetadata( + name="WebFAQBitextMiningQAs", + description="""The WebFAQ Bitext Dataset consists of natural FAQ-style Question-Answer pairs that align across languages. +A sentence in the "WebFAQBitextMiningQAs" task is a concatenation of a question and its corresponding answer. +The dataset is sourced from FAQ pages on the web.""", + reference="https://huggingface.co/PaDaS-Lab", + dataset={ + "path": "PaDaS-Lab/webfaq-bitexts", + "revision": "a1bc0e8fd36c3d5015bd64c14ca098596774784a", + }, + type="BitextMining", + category="p2p", + modalities=["text"], + eval_splits=_SPLITS, + eval_langs=_LANGUAGES, + main_score="f1", + date=("2022-09-01", "2024-10-01"), + domains=["Web", "Written"], + task_subtypes=[], + license="cc-by-4.0", + annotations_creators="human-annotated", + dialect=[], + sample_creation="human-translated", + bibtex_citation="""@misc{dinzinger2025webfaq, + title={WebFAQ: A Multilingual Collection of Natural Q&A Datasets for Dense Retrieval}, + author={Michael Dinzinger and Laura Caspari and Kanishka Ghosh Dastidar and Jelena Mitrović and Michael Granitzer}, + year={2025}, + eprint={2502.20936}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2502.20936}, +}""", + ) + + def dataset_transform(self): + dataset = {} + for langs in self.dataset: + dataset[langs] = {} + for split in _SPLITS: + sentence1 = [] + sentence2 = [] + for document in self.dataset[langs][split]: + sentence1.append(document["question1"] + " " + document["answer1"]) + sentence2.append(document["question2"] + " " + document["answer2"]) + + dataset[langs][split] = { + "sentence1": sentence1, + "sentence2": sentence2, + "gold": [(i, i) for i in range(len(sentence1))], + } + self.dataset = dataset