diff --git a/mteb/descriptive_stats/Clustering/RedditClustering-VN.v2.json b/mteb/descriptive_stats/Clustering/RedditClustering-VN.v2.json new file mode 100755 index 0000000000..72e53ed2e7 --- /dev/null +++ b/mteb/descriptive_stats/Clustering/RedditClustering-VN.v2.json @@ -0,0 +1,171 @@ +{ + "test": { + "num_samples": 2048, + "text_statistics": { + "total_text_length": 162939, + "min_text_length": 16, + "average_text_length": 79.56005859375, + "max_text_length": 396, + "unique_texts": 2014 + }, + "image_statistics": null, + "labels_statistics": { + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 50, + "labels": { + "12": { + "count": 25 + }, + "8": { + "count": 41 + }, + "0": { + "count": 39 + }, + "1": { + "count": 33 + }, + "7": { + "count": 41 + }, + "22": { + "count": 56 + }, + "14": { + "count": 57 + }, + "35": { + "count": 35 + }, + "33": { + "count": 35 + }, + "2": { + "count": 62 + }, + "6": { + "count": 32 + }, + "17": { + "count": 47 + }, + "5": { + "count": 51 + }, + "15": { + "count": 50 + }, + "31": { + "count": 48 + }, + "40": { + "count": 38 + }, + "16": { + "count": 37 + }, + "27": { + "count": 61 + }, + "29": { + "count": 24 + }, + "39": { + "count": 64 + }, + "9": { + "count": 56 + }, + "28": { + "count": 38 + }, + "34": { + "count": 37 + }, + "26": { + "count": 65 + }, + "3": { + "count": 25 + }, + "20": { + "count": 49 + }, + "44": { + "count": 63 + }, + "46": { + "count": 70 + }, + "4": { + "count": 42 + }, + "49": { + "count": 62 + }, + "48": { + "count": 28 + }, + "21": { + "count": 42 + }, + "24": { + "count": 16 + }, + "36": { + "count": 47 + }, + "13": { + "count": 24 + }, + "18": { + "count": 33 + }, + "23": { + "count": 36 + }, + "11": { + "count": 47 + }, + "42": { + "count": 24 + }, + "32": { + "count": 37 + }, + "19": { + "count": 37 + }, + "45": { + "count": 34 + }, + "30": { + "count": 46 + }, + "25": { + "count": 31 + }, + "41": { + "count": 37 + }, + "10": { + "count": 34 + }, + "43": { + "count": 19 + }, + "37": { + "count": 49 + }, + "38": { + "count": 13 + }, + "47": { + "count": 31 + } + } + } + } +} diff --git a/mteb/descriptive_stats/Clustering/RedditClusteringP2P-VN.v2.json b/mteb/descriptive_stats/Clustering/RedditClusteringP2P-VN.v2.json new file mode 100755 index 0000000000..c42ad7ac66 --- /dev/null +++ b/mteb/descriptive_stats/Clustering/RedditClusteringP2P-VN.v2.json @@ -0,0 +1,1263 @@ +{ + "test": { + "num_samples": 330995, + "text_statistics": { + "total_text_length": 243616795, + "min_text_length": 25, + "average_text_length": 736.0135198416895, + "max_text_length": 7825, + "unique_texts": 318308 + }, + "image_statistics": null, + "labels_statistics": { + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 414, + "labels": { + "FortNiteBR": { + "count": 53 + }, + "offmychest": { + "count": 45 + }, + "nus": { + "count": 6 + }, + "relationship_advice": { + "count": 9040 + }, + "premed": { + "count": 10 + }, + "dogecoin": { + "count": 6024 + }, + "GamingLaptops": { + "count": 14 + }, + "asktransgender": { + "count": 30 + }, + "MachineLearning": { + "count": 3 + }, + "buildapc": { + "count": 5827 + }, + "GunAccessoriesForSale": { + "count": 1621 + }, + "MonsterHunter": { + "count": 26 + }, + "tipofmypenis": { + "count": 11 + }, + "samsung": { + "count": 11 + }, + "PersonalFinanceCanada": { + "count": 46 + }, + "bleach": { + "count": 9 + }, + "AmItheAsshole": { + "count": 1165 + }, + "WallStreetbetsELITE": { + "count": 35 + }, + "GlobalPowers": { + "count": 4 + }, + "puppy101": { + "count": 502 + }, + "ABraThatFits": { + "count": 16 + }, + "PokemonGoFriends": { + "count": 748 + }, + "masseffect": { + "count": 26 + }, + "dating_advice": { + "count": 50 + }, + "yoga": { + "count": 6 + }, + "depression": { + "count": 46 + }, + "COVID19positive": { + "count": 18 + }, + "generationology": { + "count": 4 + }, + "NoMansSkyTheGame": { + "count": 18 + }, + "feedthebeast": { + "count": 19 + }, + "Catholicism": { + "count": 18 + }, + "thedivision": { + "count": 9 + }, + "EliteDangerous": { + "count": 26 + }, + "socialanxiety": { + "count": 13 + }, + "askTO": { + "count": 10 + }, + "Bogleheads": { + "count": 6 + }, + "dragonquest": { + "count": 6 + }, + "dirtykikpals": { + "count": 21 + }, + "gorillaz": { + "count": 3 + }, + "GoRVing": { + "count": 4 + }, + "comicswap": { + "count": 5 + }, + "VirtualYoutubers": { + "count": 11 + }, + "AskCulinary": { + "count": 9 + }, + "Gta5Modding": { + "count": 2 + }, + "obs": { + "count": 8 + }, + "KingkillerChronicle": { + "count": 2 + }, + "AmongUs": { + "count": 8 + }, + "breakingmom": { + "count": 9 + }, + "AskAnAmerican": { + "count": 10 + }, + "Dodocodes": { + "count": 2 + }, + "NoContract": { + "count": 2 + }, + "PanicAttack": { + "count": 2 + }, + "KassadinMains": { + "count": 3 + }, + "Random_Acts_Of_Amazon": { + "count": 924 + }, + "chronotrigger": { + "count": 2 + }, + "skincareexchange": { + "count": 2 + }, + "islam": { + "count": 53 + }, + "Aliexpress": { + "count": 2 + }, + "PokemonHome": { + "count": 2 + }, + "alcoholicsanonymous": { + "count": 52 + }, + "survivinginfidelity": { + "count": 5 + }, + "Dyson_Sphere_Program": { + "count": 2 + }, + "ADHD": { + "count": 2873 + }, + "BuddyCrossing": { + "count": 328 + }, + "libraryofruina": { + "count": 71 + }, + "SluttyConfessions": { + "count": 2018 + }, + "tipofmytongue": { + "count": 5741 + }, + "amcstock": { + "count": 10116 + }, + "teenagers": { + "count": 55379 + }, + "suggestmeabook": { + "count": 1182 + }, + "dirtypenpals": { + "count": 4479 + }, + "MinecraftServer": { + "count": 166 + }, + "CreditCards": { + "count": 580 + }, + "Guitar": { + "count": 8687 + }, + "rpg": { + "count": 438 + }, + "NoFap": { + "count": 9351 + }, + "fleshlight": { + "count": 96 + }, + "lfg": { + "count": 1033 + }, + "MarsWallStreet": { + "count": 863 + }, + "SummonSign": { + "count": 724 + }, + "AssassinsCreedValhala": { + "count": 223 + }, + "hoi4": { + "count": 302 + }, + "xbox": { + "count": 415 + }, + "Coins4Sale": { + "count": 238 + }, + "TooAfraidToAsk": { + "count": 5673 + }, + "KGBTR": { + "count": 475 + }, + "roblox": { + "count": 187 + }, + "NBA2k": { + "count": 392 + }, + "mechmarket": { + "count": 4472 + }, + "Gaming_Headsets": { + "count": 93 + }, + "TwoXChromosomes": { + "count": 1192 + }, + "pittsburgh": { + "count": 161 + }, + "CryptoMars": { + "count": 1518 + }, + "salesforce": { + "count": 154 + }, + "FridayNightFunkin": { + "count": 321 + }, + "vaginismus": { + "count": 63 + }, + "transpositive": { + "count": 10 + }, + "comicbooks": { + "count": 233 + }, + "BDSMcommunity": { + "count": 122 + }, + "aliens": { + "count": 169 + }, + "Scotch": { + "count": 60 + }, + "KikRoleplay": { + "count": 115 + }, + "196": { + "count": 31 + }, + "digimon": { + "count": 123 + }, + "Evernote": { + "count": 38 + }, + "Kayaking": { + "count": 70 + }, + "logh": { + "count": 21 + }, + "arlington": { + "count": 13 + }, + "Adopted": { + "count": 8 + }, + "DissonautUniverse": { + "count": 4 + }, + "Midsommar": { + "count": 11 + }, + "xmpp": { + "count": 6 + }, + "SofiawithanF": { + "count": 61 + }, + "ZombsRoyale": { + "count": 12 + }, + "accesscontrol": { + "count": 8 + }, + "WetlanderHumor": { + "count": 2 + }, + "scatstories": { + "count": 2 + }, + "techsupport": { + "count": 233 + }, + "whatcarshouldIbuy": { + "count": 66 + }, + "Stormlight_Archive": { + "count": 14 + }, + "deadbydaylight": { + "count": 60 + }, + "bicycling": { + "count": 20 + }, + "oculus": { + "count": 55 + }, + "NoFeeAC": { + "count": 71 + }, + "Crypto_com": { + "count": 34 + }, + "aromantic": { + "count": 8 + }, + "Cartalk": { + "count": 19 + }, + "Revu": { + "count": 3 + }, + "exalted": { + "count": 2 + }, + "HilariaBaldwin": { + "count": 18 + }, + "ITCareerQuestions": { + "count": 160 + }, + "Testosterone": { + "count": 31 + }, + "Screenwriting": { + "count": 131 + }, + "steinsgate": { + "count": 12 + }, + "Sims4": { + "count": 29 + }, + "LifeProTips": { + "count": 37 + }, + "Baystreetbets": { + "count": 7 + }, + "AskGirls": { + "count": 5 + }, + "idlechampions": { + "count": 6 + }, + "facebook": { + "count": 16 + }, + "mfdoom": { + "count": 2 + }, + "FiddlesticksMains": { + "count": 2 + }, + "HFY": { + "count": 4 + }, + "FiestaST": { + "count": 2 + }, + "tf2trade": { + "count": 5 + }, + "whatsthatbook": { + "count": 755 + }, + "KazuhaMains": { + "count": 142 + }, + "RepTime": { + "count": 181 + }, + "metalgearsolid": { + "count": 143 + }, + "GearsOfWar": { + "count": 636 + }, + "qBittorrent": { + "count": 31 + }, + "AstroGaming": { + "count": 124 + }, + "ELLIPAL_Official": { + "count": 23 + }, + "raisedbynarcissists": { + "count": 1908 + }, + "unpopularopinion": { + "count": 11426 + }, + "ACTrade": { + "count": 4373 + }, + "AskVet": { + "count": 647 + }, + "whowouldwin": { + "count": 3747 + }, + "playstation": { + "count": 1255 + }, + "anime": { + "count": 5570 + }, + "DotA2": { + "count": 1509 + }, + "cryptostreetbets": { + "count": 2094 + }, + "GME": { + "count": 8925 + }, + "MonsterHunterWorld": { + "count": 524 + }, + "DnD": { + "count": 3366 + }, + "leagueoflegends": { + "count": 2644 + }, + "doordash_drivers": { + "count": 1212 + }, + "theta_network": { + "count": 448 + }, + "exmuslim": { + "count": 1063 + }, + "Market76": { + "count": 10924 + }, + "gonewildaudio": { + "count": 2232 + }, + "conspiracy": { + "count": 2975 + }, + "heroesofthestorm": { + "count": 405 + }, + "askcarsales": { + "count": 1112 + }, + "FanFiction": { + "count": 2210 + }, + "Doom": { + "count": 985 + }, + "texas": { + "count": 238 + }, + "youtubers": { + "count": 623 + }, + "boardgames": { + "count": 1068 + }, + "bravelydefault": { + "count": 253 + }, + "Vent": { + "count": 1162 + }, + "askseddit": { + "count": 170 + }, + "selfimprovement": { + "count": 977 + }, + "teenagersnew": { + "count": 166 + }, + "brasil": { + "count": 985 + }, + "MatthiasSubmissions": { + "count": 841 + }, + "MarylandUnemployment": { + "count": 192 + }, + "ChronicPain": { + "count": 325 + }, + "BokunoheroFanfiction": { + "count": 127 + }, + "BenignExistence": { + "count": 106 + }, + "SaltLakeCity": { + "count": 360 + }, + "GayYoungOldDating": { + "count": 136 + }, + "Bible": { + "count": 132 + }, + "haskell": { + "count": 142 + }, + "seduction": { + "count": 260 + }, + "HiveOS": { + "count": 88 + }, + "PerkByDaylight": { + "count": 11 + }, + "xmen": { + "count": 225 + }, + "HyperRP": { + "count": 103 + }, + "ConquerorsBlade": { + "count": 141 + }, + "fantasywriters": { + "count": 139 + }, + "tutanota": { + "count": 121 + }, + "Hedgehog": { + "count": 21 + }, + "CultoftheFranklin": { + "count": 38 + }, + "langrisser": { + "count": 34 + }, + "CozyGrove": { + "count": 46 + }, + "Sverigesforsvarsmakt": { + "count": 6 + }, + "silverbugbets": { + "count": 18 + }, + "WreckingBallMains": { + "count": 3 + }, + "capitalism_in_decay": { + "count": 6 + }, + "paintdotnet": { + "count": 8 + }, + "u_mawadom118": { + "count": 4 + }, + "xboxfindfriends": { + "count": 2 + }, + "CPTSD": { + "count": 315 + }, + "destiny2": { + "count": 202 + }, + "Wallstreetsilver": { + "count": 816 + }, + "DestinyTheGame": { + "count": 698 + }, + "InstacartShoppers": { + "count": 145 + }, + "RocketLeagueExchange": { + "count": 754 + }, + "apexlegends": { + "count": 2290 + }, + "kansascity": { + "count": 48 + }, + "namenerds": { + "count": 217 + }, + "Kengan_Ashura": { + "count": 111 + }, + "GameSale": { + "count": 261 + }, + "Reduction": { + "count": 87 + }, + "sex": { + "count": 568 + }, + "bostonr4r": { + "count": 72 + }, + "LegendsOfRuneterra": { + "count": 166 + }, + "madisonwi": { + "count": 47 + }, + "steelseries": { + "count": 65 + }, + "ClashOfClansRecruit": { + "count": 206 + }, + "CharacterRant": { + "count": 35 + }, + "help": { + "count": 119 + }, + "NameThatSong": { + "count": 140 + }, + "depressed": { + "count": 58 + }, + "40kLore": { + "count": 188 + }, + "miraculousladybug": { + "count": 128 + }, + "ask": { + "count": 189 + }, + "EverMerge": { + "count": 18 + }, + "AirForce": { + "count": 65 + }, + "overlord": { + "count": 41 + }, + "BitLifeApp": { + "count": 31 + }, + "FireEmblemHeroes": { + "count": 75 + }, + "software": { + "count": 58 + }, + "GriefSupport": { + "count": 88 + }, + "blackopscoldwar": { + "count": 277 + }, + "onewheel": { + "count": 27 + }, + "MensRights": { + "count": 61 + }, + "nhl": { + "count": 17 + }, + "ps3homebrew": { + "count": 31 + }, + "ClashOfClans": { + "count": 70 + }, + "TMJ": { + "count": 34 + }, + "LightNovels": { + "count": 66 + }, + "redsox": { + "count": 22 + }, + "CryptoMarkets": { + "count": 42 + }, + "GCXRep": { + "count": 12 + }, + "cscareerquestionsEU": { + "count": 45 + }, + "sexstories": { + "count": 52 + }, + "MindHunter": { + "count": 5 + }, + "ugly": { + "count": 33 + }, + "starcraft2coop": { + "count": 10 + }, + "nanocurrency": { + "count": 1320 + }, + "ShieldAndroidTV": { + "count": 64 + }, + "thetagang": { + "count": 104 + }, + "ModelCars": { + "count": 7 + }, + "ibs": { + "count": 85 + }, + "UKJobs": { + "count": 25 + }, + "Netherlands": { + "count": 39 + }, + "podcasts": { + "count": 77 + }, + "clonewars": { + "count": 8 + }, + "Julia": { + "count": 11 + }, + "Prolactinoma": { + "count": 9 + }, + "sofi": { + "count": 11 + }, + "royalfamily": { + "count": 6 + }, + "ConnecticutR4R": { + "count": 5 + }, + "weather": { + "count": 4 + }, + "oneui": { + "count": 6 + }, + "KTM": { + "count": 5 + }, + "Aerials": { + "count": 2 + }, + "seoul": { + "count": 2 + }, + "exjw": { + "count": 2174 + }, + "ModernMagic": { + "count": 588 + }, + "Paladins": { + "count": 864 + }, + "hitbtc": { + "count": 285 + }, + "kdramarecommends": { + "count": 765 + }, + "endocrinology": { + "count": 45 + }, + "Bath": { + "count": 37 + }, + "NassauCountyHookups": { + "count": 4 + }, + "feminineboys": { + "count": 1000 + }, + "dreamsmp": { + "count": 1680 + }, + "Minecraft": { + "count": 6247 + }, + "SquaredCircle": { + "count": 2047 + }, + "spirituality": { + "count": 1358 + }, + "Eldenring": { + "count": 1182 + }, + "bonnaroo": { + "count": 162 + }, + "gardening": { + "count": 878 + }, + "Unemployment": { + "count": 4836 + }, + "mac": { + "count": 1737 + }, + "Bestbuy": { + "count": 314 + }, + "lawschooladmissions": { + "count": 2491 + }, + "McMaster": { + "count": 535 + }, + "covidlonghaulers": { + "count": 872 + }, + "stalker": { + "count": 605 + }, + "MLBTheShow": { + "count": 1992 + }, + "FortniteCompetitive": { + "count": 683 + }, + "NiceHash": { + "count": 1939 + }, + "appliancerepair": { + "count": 394 + }, + "quittingkratom": { + "count": 692 + }, + "delhi": { + "count": 193 + }, + "dpdr": { + "count": 268 + }, + "leafs": { + "count": 197 + }, + "HotWheels": { + "count": 147 + }, + "Sat": { + "count": 875 + }, + "90dayfianceuncensored": { + "count": 481 + }, + "Throwers": { + "count": 111 + }, + "CryptoHorde": { + "count": 109 + }, + "ShuumatsuNoValkyrie": { + "count": 396 + }, + "TeensMeetTeens": { + "count": 403 + }, + "thomasthetankengine": { + "count": 174 + }, + "Huel": { + "count": 255 + }, + "dbrand": { + "count": 93 + }, + "SLFmeetups": { + "count": 15 + }, + "passive_income": { + "count": 171 + }, + "Wavyhair": { + "count": 138 + }, + "1200isplentyketo": { + "count": 33 + }, + "BroadCity": { + "count": 13 + }, + "RevenantMain": { + "count": 39 + }, + "extrarfl": { + "count": 10 + }, + "AgonGame": { + "count": 4 + }, + "FitnessDE": { + "count": 2 + }, + "gaming": { + "count": 1153 + }, + "livesound": { + "count": 68 + }, + "IBO": { + "count": 1355 + }, + "EscapefromTarkov": { + "count": 821 + }, + "amex": { + "count": 130 + }, + "VinylCollectors": { + "count": 537 + }, + "cardano": { + "count": 641 + }, + "brave_browser": { + "count": 145 + }, + "dating": { + "count": 651 + }, + "OculusQuest": { + "count": 836 + }, + "Superstonk": { + "count": 2262 + }, + "MtF": { + "count": 725 + }, + "DMAcademy": { + "count": 897 + }, + "findaleague": { + "count": 156 + }, + "Nioh": { + "count": 256 + }, + "IRS": { + "count": 431 + }, + "transgendercirclejerk": { + "count": 241 + }, + "learnmath": { + "count": 283 + }, + "piano": { + "count": 239 + }, + "LeagueConnect": { + "count": 198 + }, + "eu4": { + "count": 380 + }, + "Wordpress": { + "count": 288 + }, + "RoleplayingForReddit": { + "count": 28 + }, + "LOONA": { + "count": 82 + }, + "newtothenavy": { + "count": 98 + }, + "HaircareScience": { + "count": 50 + }, + "appletv": { + "count": 158 + }, + "raleigh": { + "count": 151 + }, + "realonlyfansreviews": { + "count": 19 + }, + "AskGames": { + "count": 47 + }, + "PokemonTCG": { + "count": 258 + }, + "GoogleDataStudio": { + "count": 12 + }, + "controlgame": { + "count": 86 + }, + "MECoOp": { + "count": 25 + }, + "snuffrp": { + "count": 36 + }, + "wicked_edge": { + "count": 72 + }, + "WhiteWolfRPG": { + "count": 105 + }, + "BMW": { + "count": 84 + }, + "choiceofgames": { + "count": 19 + }, + "hisdarkmaterials": { + "count": 10 + }, + "SakuraGakuin": { + "count": 24 + }, + "detrans": { + "count": 38 + }, + "Smallville": { + "count": 30 + }, + "kingofqueens": { + "count": 6 + }, + "JamesHoffmann": { + "count": 19 + }, + "stashinvest": { + "count": 12 + }, + "lockpicking": { + "count": 56 + }, + "sissypersonals": { + "count": 89 + }, + "ABA": { + "count": 64 + }, + "ladybusiness": { + "count": 8 + }, + "gamegrumps": { + "count": 30 + }, + "GodEater": { + "count": 20 + }, + "tomorrow": { + "count": 37 + }, + "Tomorrowland": { + "count": 9 + }, + "BlackCountryNewRoad": { + "count": 5 + }, + "STAYC": { + "count": 3 + }, + "SatoshiStreetBets": { + "count": 3393 + }, + "AskLosAngeles": { + "count": 923 + }, + "buildapcforme": { + "count": 1375 + }, + "ApplyingToCollege": { + "count": 7527 + }, + "watercooling": { + "count": 993 + }, + "BreakUps": { + "count": 2765 + }, + "FIFA": { + "count": 2938 + }, + "emacs": { + "count": 624 + }, + "Shittyaskflying": { + "count": 115 + }, + "AmazonFC": { + "count": 815 + }, + "stocks": { + "count": 3800 + }, + "religion": { + "count": 505 + }, + "pokemon": { + "count": 3601 + }, + "trakstocks": { + "count": 571 + }, + "cuboulder": { + "count": 204 + }, + "self": { + "count": 1294 + }, + "TheMagnusArchives": { + "count": 234 + }, + "tarot": { + "count": 727 + }, + "Superhero_Ideas": { + "count": 28 + }, + "touhou": { + "count": 558 + }, + "NTU": { + "count": 198 + }, + "JoJolion": { + "count": 44 + }, + "BangaloreMains": { + "count": 20 + }, + "turtles": { + "count": 29 + }, + "popperpigs": { + "count": 55 + }, + "aggretsuko": { + "count": 16 + }, + "lasers": { + "count": 20 + }, + "Library": { + "count": 5 + } + } + } + } +} diff --git a/mteb/descriptive_stats/Clustering/StackExchangeClustering-VN.v2.json b/mteb/descriptive_stats/Clustering/StackExchangeClustering-VN.v2.json new file mode 100755 index 0000000000..2daf6ff655 --- /dev/null +++ b/mteb/descriptive_stats/Clustering/StackExchangeClustering-VN.v2.json @@ -0,0 +1,171 @@ +{ + "test": { + "num_samples": 2048, + "text_statistics": { + "total_text_length": 133154, + "min_text_length": 12, + "average_text_length": 65.0166015625, + "max_text_length": 197, + "unique_texts": 2046 + }, + "image_statistics": null, + "labels_statistics": { + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 50, + "labels": { + "47": { + "count": 45 + }, + "38": { + "count": 50 + }, + "27": { + "count": 46 + }, + "10": { + "count": 21 + }, + "48": { + "count": 74 + }, + "42": { + "count": 47 + }, + "22": { + "count": 37 + }, + "30": { + "count": 25 + }, + "44": { + "count": 38 + }, + "37": { + "count": 58 + }, + "21": { + "count": 41 + }, + "46": { + "count": 92 + }, + "7": { + "count": 61 + }, + "35": { + "count": 52 + }, + "17": { + "count": 37 + }, + "1": { + "count": 38 + }, + "19": { + "count": 12 + }, + "39": { + "count": 54 + }, + "40": { + "count": 114 + }, + "26": { + "count": 12 + }, + "14": { + "count": 70 + }, + "34": { + "count": 16 + }, + "32": { + "count": 8 + }, + "15": { + "count": 25 + }, + "23": { + "count": 59 + }, + "28": { + "count": 13 + }, + "0": { + "count": 55 + }, + "25": { + "count": 145 + }, + "5": { + "count": 68 + }, + "13": { + "count": 27 + }, + "45": { + "count": 20 + }, + "31": { + "count": 47 + }, + "4": { + "count": 32 + }, + "2": { + "count": 52 + }, + "3": { + "count": 60 + }, + "16": { + "count": 11 + }, + "24": { + "count": 20 + }, + "33": { + "count": 51 + }, + "12": { + "count": 24 + }, + "11": { + "count": 34 + }, + "29": { + "count": 16 + }, + "9": { + "count": 57 + }, + "49": { + "count": 24 + }, + "8": { + "count": 7 + }, + "20": { + "count": 36 + }, + "36": { + "count": 48 + }, + "43": { + "count": 31 + }, + "6": { + "count": 22 + }, + "41": { + "count": 10 + }, + "18": { + "count": 6 + } + } + } + } +} diff --git a/mteb/descriptive_stats/Clustering/StackExchangeClusteringP2P-VN.v2.json b/mteb/descriptive_stats/Clustering/StackExchangeClusteringP2P-VN.v2.json new file mode 100755 index 0000000000..9855252bf2 --- /dev/null +++ b/mteb/descriptive_stats/Clustering/StackExchangeClusteringP2P-VN.v2.json @@ -0,0 +1,1500 @@ +{ + "test": { + "num_samples": 64785, + "text_statistics": { + "total_text_length": 73003068, + "min_text_length": 45, + "average_text_length": 1126.8514007872193, + "max_text_length": 5812, + "unique_texts": 42969 + }, + "image_statistics": null, + "labels_statistics": { + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 493, + "labels": { + "unity": { + "count": 16511 + }, + "directx": { + "count": 513 + }, + "opengl": { + "count": 4328 + }, + "javascript": { + "count": 993 + }, + "c#": { + "count": 2847 + }, + "animation": { + "count": 471 + }, + "game-design": { + "count": 1442 + }, + "procedural-generation": { + "count": 269 + }, + "physics": { + "count": 839 + }, + "c++": { + "count": 3960 + }, + "terminology": { + "count": 54 + }, + "android": { + "count": 1530 + }, + "graphics": { + "count": 397 + }, + "2d": { + "count": 1989 + }, + "java": { + "count": 3383 + }, + "ios": { + "count": 379 + }, + "cocos2d-iphone": { + "count": 164 + }, + "tools": { + "count": 86 + }, + "terrain": { + "count": 73 + }, + "xna": { + "count": 3604 + }, + "html5": { + "count": 129 + }, + "rendering": { + "count": 289 + }, + "mathematics": { + "count": 1050 + }, + "architecture": { + "count": 874 + }, + "matrix": { + "count": 87 + }, + "lighting": { + "count": 103 + }, + "shaders": { + "count": 629 + }, + "libgdx": { + "count": 1195 + }, + "collision-detection": { + "count": 1005 + }, + "sdl": { + "count": 112 + }, + "blender": { + "count": 117 + }, + "software-engineering": { + "count": 255 + }, + "design-patterns": { + "count": 43 + }, + "box2d": { + "count": 228 + }, + "color": { + "count": 25 + }, + "udk": { + "count": 118 + }, + "transformation": { + "count": 33 + }, + "3d": { + "count": 1128 + }, + "tiles": { + "count": 135 + }, + "camera": { + "count": 248 + }, + "legal": { + "count": 295 + }, + "python": { + "count": 293 + }, + "andengine": { + "count": 75 + }, + "distribution": { + "count": 6 + }, + "lua": { + "count": 90 + }, + "algorithm": { + "count": 589 + }, + "tilemap": { + "count": 95 + }, + "game-maker": { + "count": 295 + }, + "pygame": { + "count": 45 + }, + "networking": { + "count": 424 + }, + "flash": { + "count": 308 + }, + "sprites": { + "count": 264 + }, + "movement": { + "count": 179 + }, + "virtual-reality": { + "count": 12 + }, + "hlsl": { + "count": 101 + }, + "waypoints": { + "count": 5 + }, + "mmo": { + "count": 156 + }, + "textures": { + "count": 481 + }, + "facebook": { + "count": 27 + }, + "level-design": { + "count": 42 + }, + "rigging": { + "count": 14 + }, + "open-source": { + "count": 44 + }, + "software-rendering": { + "count": 7 + }, + "publishing": { + "count": 48 + }, + "books": { + "count": 22 + }, + "objects": { + "count": 7 + }, + "minecraft-modding": { + "count": 102 + }, + "storyboard": { + "count": 3 + }, + "iphone": { + "count": 338 + }, + "sharpdx": { + "count": 22 + }, + "glsl": { + "count": 114 + }, + "opengl-es": { + "count": 124 + }, + "loading": { + "count": 19 + }, + "effects": { + "count": 3 + }, + "accessibility": { + "count": 5 + }, + "performance": { + "count": 163 + }, + "browser-based-games": { + "count": 68 + }, + "path-finding": { + "count": 239 + }, + "fund-raising": { + "count": 4 + }, + "sdl2": { + "count": 78 + }, + "xna-4.0": { + "count": 218 + }, + "monogame": { + "count": 114 + }, + "game-loop": { + "count": 78 + }, + "music": { + "count": 35 + }, + "board-game": { + "count": 3 + }, + "opengl-es2": { + "count": 60 + }, + "game-industry": { + "count": 90 + }, + "physx": { + "count": 11 + }, + "input": { + "count": 136 + }, + "ai": { + "count": 385 + }, + "unreal": { + "count": 68 + }, + "starcraft-2": { + "count": 8 + }, + "geometry": { + "count": 89 + }, + "directx11": { + "count": 251 + }, + "smartfox": { + "count": 2 + }, + "jobs": { + "count": 20 + }, + "quaternion": { + "count": 21 + }, + "actionscript-3": { + "count": 105 + }, + "education": { + "count": 45 + }, + "art": { + "count": 52 + }, + "copyright": { + "count": 57 + }, + "roguelikes": { + "count": 7 + }, + "orientation": { + "count": 3 + }, + "projection": { + "count": 9 + }, + "phaser": { + "count": 36 + }, + "antialiasing": { + "count": 4 + }, + "3dsmax": { + "count": 51 + }, + "isometric": { + "count": 29 + }, + "mouse": { + "count": 14 + }, + "unreal-4": { + "count": 336 + }, + "cocos2d": { + "count": 53 + }, + "assets": { + "count": 99 + }, + "nintendo": { + "count": 20 + }, + "asset-management": { + "count": 4 + }, + "mobile": { + "count": 65 + }, + "graphics-programming": { + "count": 60 + }, + "maya": { + "count": 24 + }, + "control": { + "count": 8 + }, + "game-mechanics": { + "count": 167 + }, + "directx9": { + "count": 45 + }, + "windows": { + "count": 68 + }, + "component-based": { + "count": 59 + }, + "gpu": { + "count": 39 + }, + "gui": { + "count": 87 + }, + "adventure-game-studio": { + "count": 3 + }, + "roblox": { + "count": 15 + }, + "rotation": { + "count": 176 + }, + "planning": { + "count": 12 + }, + "uv-mapping": { + "count": 2 + }, + "cross-platform": { + "count": 35 + }, + "c": { + "count": 43 + }, + "vector": { + "count": 105 + }, + "terrain-rendering": { + "count": 23 + }, + "multiplayer": { + "count": 267 + }, + "screen": { + "count": 4 + }, + "project-management": { + "count": 37 + }, + "data-structure": { + "count": 53 + }, + "rpg": { + "count": 66 + }, + "source-engine": { + "count": 7 + }, + "windows-phone-7": { + "count": 15 + }, + "testing": { + "count": 25 + }, + "controllers": { + "count": 11 + }, + "projectile-physics": { + "count": 17 + }, + "godot": { + "count": 147 + }, + "collider": { + "count": 3 + }, + "user-experience": { + "count": 22 + }, + "maps": { + "count": 83 + }, + "free-to-play": { + "count": 5 + }, + "physics-engine": { + "count": 33 + }, + "keyboard": { + "count": 15 + }, + "random": { + "count": 26 + }, + "fragment-shader": { + "count": 12 + }, + "file-format": { + "count": 15 + }, + "bullet-physics": { + "count": 37 + }, + "server": { + "count": 84 + }, + "racing": { + "count": 9 + }, + "unityscript": { + "count": 14 + }, + "simulations": { + "count": 29 + }, + "directx10": { + "count": 15 + }, + "licensing": { + "count": 43 + }, + "xbox360": { + "count": 24 + }, + "multithreading": { + "count": 47 + }, + "point-cloud": { + "count": 3 + }, + "sfml": { + "count": 51 + }, + "modding": { + "count": 80 + }, + "logic": { + "count": 16 + }, + "vulkan": { + "count": 17 + }, + "adventure-games": { + "count": 10 + }, + "development-speed": { + "count": 6 + }, + "competition": { + "count": 10 + }, + "voxels": { + "count": 36 + }, + "obj": { + "count": 6 + }, + "business": { + "count": 34 + }, + "entity-system": { + "count": 98 + }, + "3d-meshes": { + "count": 50 + }, + "audio": { + "count": 50 + }, + "databases": { + "count": 39 + }, + "html-canvas": { + "count": 28 + }, + "php": { + "count": 31 + }, + "cocos2d-x-js": { + "count": 7 + }, + "hardware": { + "count": 17 + }, + "trigonometry": { + "count": 2 + }, + "levels": { + "count": 12 + }, + "blender-game-engine": { + "count": 5 + }, + "touch": { + "count": 4 + }, + "events": { + "count": 20 + }, + "macos": { + "count": 7 + }, + "webgl": { + "count": 29 + }, + "web": { + "count": 29 + }, + "savegame": { + "count": 30 + }, + "entity-component-system": { + "count": 11 + }, + "resolution": { + "count": 11 + }, + "particles": { + "count": 13 + }, + "octree": { + "count": 4 + }, + "steam": { + "count": 63 + }, + "dev-groups": { + "count": 2 + }, + "leaderboards": { + "count": 28 + }, + "special-effects": { + "count": 9 + }, + "client-server": { + "count": 44 + }, + "scene-graph": { + "count": 12 + }, + "models": { + "count": 109 + }, + "timer": { + "count": 20 + }, + "corona-sdk": { + "count": 24 + }, + "node.js": { + "count": 19 + }, + "security": { + "count": 14 + }, + "raycasting": { + "count": 23 + }, + "jmonkeyengine": { + "count": 12 + }, + "sky": { + "count": 3 + }, + "game-recording": { + "count": 5 + }, + "optimization": { + "count": 53 + }, + "character": { + "count": 22 + }, + "appstore": { + "count": 10 + }, + "rigidbody": { + "count": 13 + }, + "objective-c": { + "count": 20 + }, + "transparency": { + "count": 6 + }, + "ranking": { + "count": 14 + }, + "inventory": { + "count": 3 + }, + "cocos2d-x": { + "count": 63 + }, + "marketing": { + "count": 88 + }, + "coordinates": { + "count": 29 + }, + "ogre": { + "count": 38 + }, + "wpf": { + "count": 2 + }, + "floating-point": { + "count": 2 + }, + "voice": { + "count": 9 + }, + "space-partitioning": { + "count": 3 + }, + "sound": { + "count": 35 + }, + "deployment": { + "count": 8 + }, + "game-maker-dnd": { + "count": 4 + }, + "console": { + "count": 21 + }, + "havok": { + "count": 8 + }, + "three.js": { + "count": 13 + }, + "scripting": { + "count": 64 + }, + "joystick": { + "count": 3 + }, + "level-of-detail": { + "count": 9 + }, + "quake3": { + "count": 4 + }, + "interpolation": { + "count": 12 + }, + "frame-rate": { + "count": 30 + }, + "spritekit": { + "count": 22 + }, + "file": { + "count": 6 + }, + "sales": { + "count": 3 + }, + "collision-resolution": { + "count": 23 + }, + "google-play": { + "count": 33 + }, + "tetris": { + "count": 2 + }, + "ruby": { + "count": 14 + }, + "gimp": { + "count": 2 + }, + "grid": { + "count": 10 + }, + "graphics-design": { + "count": 10 + }, + "heuristics": { + "count": 3 + }, + "google-play-services": { + "count": 5 + }, + "visual-studio": { + "count": 7 + }, + "playstation4": { + "count": 15 + }, + "line-of-sight": { + "count": 5 + }, + "refactoring": { + "count": 2 + }, + "interactive-fiction": { + "count": 7 + }, + "scoring": { + "count": 11 + }, + "game-state": { + "count": 3 + }, + "fluid-dynamics": { + "count": 3 + }, + "anti-cheat": { + "count": 9 + }, + "rpg-maker": { + "count": 7 + }, + "direct3d12": { + "count": 13 + }, + "parallax-scrolling": { + "count": 4 + }, + "memory": { + "count": 3 + }, + "pc": { + "count": 6 + }, + "state": { + "count": 9 + }, + "collada": { + "count": 4 + }, + "construct-2": { + "count": 4 + }, + "platformer": { + "count": 22 + }, + "replays": { + "count": 2 + }, + "curves": { + "count": 4 + }, + "scala": { + "count": 2 + }, + "aabb": { + "count": 4 + }, + "porting": { + "count": 4 + }, + "cryengine": { + "count": 10 + }, + "glfw": { + "count": 9 + }, + ".net": { + "count": 7 + }, + "raytracing": { + "count": 9 + }, + "fmod": { + "count": 3 + }, + "flixel": { + "count": 6 + }, + "hdr": { + "count": 3 + }, + "platform": { + "count": 11 + }, + "community-management": { + "count": 11 + }, + "mesh": { + "count": 12 + }, + "hexagonal-grid": { + "count": 10 + }, + "materials": { + "count": 9 + }, + "spritesheet": { + "count": 20 + }, + "career": { + "count": 35 + }, + "lwjgl": { + "count": 39 + }, + "installer": { + "count": 4 + }, + "spatial-partitioning": { + "count": 3 + }, + "pyglet": { + "count": 6 + }, + "slick": { + "count": 9 + }, + "image": { + "count": 19 + }, + "sql": { + "count": 4 + }, + "rts": { + "count": 26 + }, + "localization": { + "count": 7 + }, + "text": { + "count": 18 + }, + "map-editor": { + "count": 6 + }, + "viewport": { + "count": 3 + }, + "face": { + "count": 4 + }, + "first-person-shooter": { + "count": 19 + }, + "noise": { + "count": 2 + }, + "tower-defense": { + "count": 5 + }, + "jbox2d": { + "count": 3 + }, + "fsm": { + "count": 14 + }, + "content-rating": { + "count": 8 + }, + "glut": { + "count": 3 + }, + "turn-based": { + "count": 20 + }, + "compression": { + "count": 8 + }, + "vertex": { + "count": 10 + }, + "teamwork": { + "count": 4 + }, + "slimdx": { + "count": 3 + }, + "plugin": { + "count": 3 + }, + "selection": { + "count": 3 + }, + "intersection": { + "count": 3 + }, + "graphic-effects": { + "count": 23 + }, + "shading": { + "count": 4 + }, + "entity-component": { + "count": 6 + }, + "fonts": { + "count": 21 + }, + "spherical-harmonics": { + "count": 2 + }, + "vb.net": { + "count": 5 + }, + "psm": { + "count": 6 + }, + "actionscript": { + "count": 7 + }, + "microsoft": { + "count": 7 + }, + "puzzle": { + "count": 16 + }, + "car": { + "count": 2 + }, + "sound-effects": { + "count": 3 + }, + "love2d": { + "count": 10 + }, + "quadtree": { + "count": 6 + }, + "linear-algebra": { + "count": 9 + }, + "scale": { + "count": 2 + }, + "ipad": { + "count": 6 + }, + "linux": { + "count": 19 + }, + "global-illumination": { + "count": 4 + }, + "unity-networking": { + "count": 11 + }, + "crusader-kings-2-modding": { + "count": 5 + }, + "playtesting": { + "count": 5 + }, + "heightmap": { + "count": 6 + }, + "kinect": { + "count": 4 + }, + "websocket": { + "count": 6 + }, + "statistics": { + "count": 3 + }, + "fbx": { + "count": 7 + }, + "jumping": { + "count": 5 + }, + "world-of-warcraft-modding": { + "count": 3 + }, + "source-code": { + "count": 2 + }, + "perlin-noise": { + "count": 3 + }, + "skyrim-modding": { + "count": 9 + }, + "debugging": { + "count": 7 + }, + "economy": { + "count": 5 + }, + "assembly": { + "count": 3 + }, + "geolocation": { + "count": 2 + }, + "timestep": { + "count": 3 + }, + "skeletal-animation": { + "count": 17 + }, + "splash-screen": { + "count": 2 + }, + "pixel": { + "count": 10 + }, + "swift": { + "count": 5 + }, + "visualization": { + "count": 3 + }, + "balance": { + "count": 6 + }, + "management": { + "count": 2 + }, + "eclipse": { + "count": 3 + }, + "pixel-art": { + "count": 13 + }, + "tournament": { + "count": 2 + }, + "zbrush": { + "count": 2 + }, + "motivation": { + "count": 6 + }, + "online": { + "count": 8 + }, + "oop": { + "count": 6 + }, + "index-buffer": { + "count": 5 + }, + "timing": { + "count": 5 + }, + "methodology": { + "count": 10 + }, + "impactjs": { + "count": 2 + }, + "depth-buffer": { + "count": 11 + }, + "interface": { + "count": 5 + }, + "go": { + "count": 3 + }, + "steering-behaviors": { + "count": 4 + }, + "wii": { + "count": 3 + }, + "path": { + "count": 3 + }, + "editors": { + "count": 3 + }, + "monetization": { + "count": 12 + }, + "process": { + "count": 4 + }, + "trademark": { + "count": 8 + }, + "soya3d": { + "count": 2 + }, + "mission-design": { + "count": 3 + }, + "version-control": { + "count": 6 + }, + "resource-management": { + "count": 8 + }, + "marching-cubes": { + "count": 5 + }, + "skybox": { + "count": 8 + }, + "scene": { + "count": 7 + }, + "haxe": { + "count": 3 + }, + "openscenegraph": { + "count": 2 + }, + "encryption": { + "count": 4 + }, + "turn-based-strategy": { + "count": 6 + }, + "advertisements": { + "count": 5 + }, + "beta": { + "count": 3 + }, + "game-maker-studio-2": { + "count": 3 + }, + "3d-modeling": { + "count": 9 + }, + "udp": { + "count": 3 + }, + "jquery": { + "count": 4 + }, + "cloud-computing": { + "count": 2 + }, + "commodore-64": { + "count": 3 + }, + "ui-design": { + "count": 4 + }, + "release": { + "count": 3 + }, + "demo": { + "count": 4 + }, + "normals": { + "count": 8 + }, + "navmesh": { + "count": 2 + }, + "social": { + "count": 3 + }, + "separating-axis-theorem": { + "count": 4 + }, + "data-driven": { + "count": 4 + }, + "revenue": { + "count": 4 + }, + "openal": { + "count": 2 + }, + "shadows": { + "count": 7 + }, + "window-management": { + "count": 4 + }, + "magicavoxel": { + "count": 2 + }, + "twine": { + "count": 4 + }, + "allegro": { + "count": 7 + }, + "video": { + "count": 18 + }, + "pipeline": { + "count": 4 + }, + "normal-mapping": { + "count": 4 + }, + "css": { + "count": 2 + }, + "game-center": { + "count": 3 + }, + "text-based": { + "count": 5 + }, + "torque-x": { + "count": 2 + }, + "signed-distance-field": { + "count": 3 + }, + "spritebatch": { + "count": 3 + }, + "business-model": { + "count": 2 + }, + "opentk": { + "count": 4 + }, + "assimp": { + "count": 6 + }, + "trailer": { + "count": 3 + }, + "unity-webgl": { + "count": 2 + }, + "tiled": { + "count": 3 + }, + "glm": { + "count": 2 + }, + "splines": { + "count": 6 + }, + "unit-testing": { + "count": 3 + }, + "compatibility": { + "count": 3 + }, + "strategy": { + "count": 4 + }, + "analytics": { + "count": 4 + }, + "irrlicht": { + "count": 2 + }, + "configuration": { + "count": 3 + }, + "filesystem": { + "count": 2 + }, + "piracy": { + "count": 3 + }, + "difficulty": { + "count": 3 + }, + "water": { + "count": 3 + }, + "teaching": { + "count": 4 + }, + "memory-efficiency": { + "count": 9 + }, + "sketchup": { + "count": 2 + }, + "portals": { + "count": 3 + }, + "windows-forms": { + "count": 5 + }, + "farseer-physics-engine": { + "count": 2 + }, + "hammer": { + "count": 2 + }, + "efficiency": { + "count": 4 + }, + "alpha-blending": { + "count": 3 + }, + "card-game": { + "count": 4 + }, + "human-resources": { + "count": 2 + }, + "quake2": { + "count": 2 + }, + "emulation": { + "count": 5 + }, + "image-processing": { + "count": 2 + }, + "hacks": { + "count": 3 + }, + "tessellation": { + "count": 2 + }, + "mvc": { + "count": 2 + }, + "jogl": { + "count": 3 + }, + "ide": { + "count": 2 + }, + "augmented-reality": { + "count": 3 + }, + "productivity": { + "count": 2 + }, + "oculus": { + "count": 2 + }, + "vsync": { + "count": 2 + }, + "google-app-engine": { + "count": 2 + }, + "authentication": { + "count": 3 + }, + "spawning": { + "count": 2 + }, + "user-interface": { + "count": 3 + }, + "gameobject": { + "count": 3 + }, + "silverlight": { + "count": 3 + }, + "htc-vive": { + "count": 2 + }, + "profiling": { + "count": 2 + }, + "lumberyard-engine": { + "count": 3 + }, + "unity-ads": { + "count": 2 + }, + "ajax": { + "count": 2 + }, + "bot": { + "count": 2 + }, + "outsourcing": { + "count": 2 + }, + "functional": { + "count": 2 + }, + "synchronization": { + "count": 2 + }, + "user-generated-content": { + "count": 2 + }, + "rpg-maker-mv": { + "count": 2 + }, + "javafx": { + "count": 2 + }, + "data": { + "count": 2 + }, + "computational-geometry": { + "count": 3 + }, + "harlowe": { + "count": 2 + }, + "hud": { + "count": 2 + }, + "stencyl": { + "count": 2 + }, + "rpg-maker-xp": { + "count": 3 + }, + "stage3d": { + "count": 2 + }, + "renpy": { + "count": 2 + }, + "content-generation": { + "count": 2 + }, + "babylonjs": { + "count": 2 + }, + "palette": { + "count": 2 + }, + "culling": { + "count": 2 + } + } + } + } +} diff --git a/mteb/descriptive_stats/Clustering/TwentyNewsgroupsClustering-VN.v2.json b/mteb/descriptive_stats/Clustering/TwentyNewsgroupsClustering-VN.v2.json new file mode 100755 index 0000000000..88821e7cbe --- /dev/null +++ b/mteb/descriptive_stats/Clustering/TwentyNewsgroupsClustering-VN.v2.json @@ -0,0 +1,81 @@ +{ + "test": { + "num_samples": 35034, + "text_statistics": { + "total_text_length": 1324367, + "min_text_length": 7, + "average_text_length": 37.80233487469315, + "max_text_length": 337, + "unique_texts": 3335 + }, + "image_statistics": null, + "labels_statistics": { + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 20, + "labels": { + "12": { + "count": 1423 + }, + "6": { + "count": 1964 + }, + "2": { + "count": 2307 + }, + "17": { + "count": 2009 + }, + "0": { + "count": 1202 + }, + "1": { + "count": 1841 + }, + "10": { + "count": 1794 + }, + "15": { + "count": 1545 + }, + "7": { + "count": 2139 + }, + "5": { + "count": 2047 + }, + "3": { + "count": 2143 + }, + "11": { + "count": 1885 + }, + "8": { + "count": 1727 + }, + "19": { + "count": 1214 + }, + "9": { + "count": 1656 + }, + "4": { + "count": 2281 + }, + "13": { + "count": 1153 + }, + "18": { + "count": 1514 + }, + "16": { + "count": 1456 + }, + "14": { + "count": 1734 + } + } + } + } +} diff --git a/mteb/tasks/clustering/vie/__init__.py b/mteb/tasks/clustering/vie/__init__.py old mode 100644 new mode 100755 index 1a651a7840..8d5eac7846 --- a/mteb/tasks/clustering/vie/__init__.py +++ b/mteb/tasks/clustering/vie/__init__.py @@ -1,13 +1,27 @@ -from .reddit_clustering_p2p_vn import RedditClusteringP2PVN -from .reddit_clustering_vn import RedditClusteringVN -from .stack_exchange_clustering_p2p_vn import StackExchangeClusteringP2PVN -from .stack_exchange_clustering_vn import StackExchangeClusteringVN -from .twenty_newsgroups_clustering_vn import TwentyNewsgroupsClusteringVN +from .reddit_clustering_p2p_vn import RedditClusteringP2PVN, RedditFastClusteringP2PVN +from .reddit_clustering_vn import RedditClusteringVN, RedditFastClusteringVN +from .stack_exchange_clustering_p2p_vn import ( + StackExchangeClusteringP2PVN, + StackExchangeFastClusteringP2PVN, +) +from .stack_exchange_clustering_vn import ( + StackExchangeClusteringVN, + StackExchangeFastClusteringVN, +) +from .twenty_newsgroups_clustering_vn import ( + TwentyNewsgroupsClusteringVN, + TwentyNewsgroupsFastClusteringVN, +) __all__ = [ "RedditClusteringP2PVN", "RedditClusteringVN", + "RedditFastClusteringP2PVN", + "RedditFastClusteringVN", "StackExchangeClusteringP2PVN", "StackExchangeClusteringVN", + "StackExchangeFastClusteringP2PVN", + "StackExchangeFastClusteringVN", "TwentyNewsgroupsClusteringVN", + "TwentyNewsgroupsFastClusteringVN", ] diff --git a/mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py b/mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py old mode 100644 new mode 100755 index d838500af0..27fc05f2e9 --- a/mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +++ b/mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py @@ -1,3 +1,12 @@ +import itertools + +import numpy as np +from datasets import Dataset, DatasetDict + +from mteb.abstasks.clustering import ( + AbsTaskClustering, + _check_label_distribution, +) from mteb.abstasks.clustering_legacy import AbsTaskClusteringLegacy from mteb.abstasks.task_metadata import TaskMetadata @@ -35,4 +44,63 @@ class RedditClusteringP2PVN(AbsTaskClusteringLegacy): } """, adapted_from=["RedditClusteringP2P"], + superseded_by="RedditClusteringP2P-VN.v2", ) + + +class RedditFastClusteringP2PVN(AbsTaskClustering): + metadata = TaskMetadata( + name="RedditClusteringP2P-VN.v2", + description="A translated dataset from Clustering of title+posts from reddit. Clustering of 10 sets of 50k paragraphs and 40 sets of 10k paragraphs. The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. - Applies advanced embedding models to filter the translations. - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.", + reference="https://arxiv.org/abs/2104.07081", + dataset={ + "path": "GreenNode/reddit-clustering-p2p-vn", + "revision": "841856dcb82496f1f2f59356e4798ce15baeb200", + }, + type="Clustering", + category="t2c", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="v_measure", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Web", "Social", "Written"], + task_subtypes=["Thematic clustering"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["RedditClusteringP2P-VN"], + ) + + def dataset_transform(self): + ds = {} + for split in self.metadata.eval_splits: + labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"])) + sentences = list( + itertools.chain.from_iterable(self.dataset[split]["sentences"]) + ) + + _check_label_distribution(self.dataset[split]) + + # Remove sentences and labels with only 1 label example. + unique_labels, counts = np.unique(labels, return_counts=True) + solo_label_idx = np.where(counts == 1) + solo_labels = unique_labels[solo_label_idx] + for solo_label in solo_labels: + loc = labels.index(solo_label) + labels.pop(loc) + sentences.pop(loc) + ds[split] = Dataset.from_dict({"labels": labels, "sentences": sentences}) + + self.dataset = DatasetDict(ds) diff --git a/mteb/tasks/clustering/vie/reddit_clustering_vn.py b/mteb/tasks/clustering/vie/reddit_clustering_vn.py old mode 100644 new mode 100755 index 86e6931c54..d6c2ac2876 --- a/mteb/tasks/clustering/vie/reddit_clustering_vn.py +++ b/mteb/tasks/clustering/vie/reddit_clustering_vn.py @@ -1,3 +1,11 @@ +import itertools + +from datasets import Dataset, DatasetDict + +from mteb.abstasks.clustering import ( + AbsTaskClustering, + _check_label_distribution, +) from mteb.abstasks.clustering_legacy import AbsTaskClusteringLegacy from mteb.abstasks.task_metadata import TaskMetadata @@ -35,4 +43,59 @@ class RedditClusteringVN(AbsTaskClusteringLegacy): } """, adapted_from=["RedditClustering"], + superseded_by="RedditClustering-VN.v2", ) + + +class RedditFastClusteringVN(AbsTaskClustering): + metadata = TaskMetadata( + name="RedditClustering-VN.v2", + description="A translated dataset from Clustering of titles from 199 subreddits. Clustering of 25 sets, each with 10-50 classes, and each class with 100 - 1000 sentences. The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. - Applies advanced embedding models to filter the translations. - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.", + reference="https://arxiv.org/abs/2104.07081", + dataset={ + "path": "GreenNode/reddit-clustering-vn", + "revision": "7f7d4097979633181b2df3f73905218f74c4665d", + }, + type="Clustering", + category="t2c", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="v_measure", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Web", "Social", "Written"], + task_subtypes=["Thematic clustering"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["RedditClustering"], + ) + + def dataset_transform(self): + ds = {} + for split in self.metadata.eval_splits: + labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"])) + sentences = list( + itertools.chain.from_iterable(self.dataset[split]["sentences"]) + ) + _check_label_distribution(self.dataset[split]) + ds[split] = Dataset.from_dict({"labels": labels, "sentences": sentences}) + self.dataset = DatasetDict(ds) + self.dataset = self.stratified_subsampling( + self.dataset, + self.seed, + self.metadata.eval_splits, + label="labels", + ) + self.max_fraction_of_documents_to_embed = None diff --git a/mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py b/mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py old mode 100644 new mode 100755 index a6a7bc117f..f1efd2a4c3 --- a/mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +++ b/mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py @@ -1,3 +1,12 @@ +import itertools + +import numpy as np +from datasets import Dataset, DatasetDict + +from mteb.abstasks.clustering import ( + AbsTaskClustering, + _check_label_distribution, +) from mteb.abstasks.clustering_legacy import AbsTaskClusteringLegacy from mteb.abstasks.task_metadata import TaskMetadata @@ -35,4 +44,63 @@ class StackExchangeClusteringP2PVN(AbsTaskClusteringLegacy): } """, adapted_from=["StackExchangeClusteringP2P"], + superseded_by="StackExchangeClusteringP2P-VN.v2", ) + + +class StackExchangeFastClusteringP2PVN(AbsTaskClustering): + metadata = TaskMetadata( + name="StackExchangeClusteringP2P-VN.v2", + description="A translated Clustering of title+body from stackexchange. Clustering of 5 sets of 10k paragraphs and 5 sets of 5k paragraphs. The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. - Applies advanced embedding models to filter the translations. - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.", + reference="https://arxiv.org/abs/2104.07081", + dataset={ + "path": "GreenNode/stackexchange-clustering-p2p-vn", + "revision": "8f154ee524a466850028531d21e1a62d958b8156", + }, + type="Clustering", + category="t2c", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="v_measure", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Web", "Written"], + task_subtypes=["Thematic clustering"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["StackExchangeClusteringP2P-VN"], + ) + + def dataset_transform(self): + ds = {} + for split in self.metadata.eval_splits: + labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"])) + sentences = list( + itertools.chain.from_iterable(self.dataset[split]["sentences"]) + ) + + _check_label_distribution(self.dataset[split]) + + # Remove sentences and labels with only 1 label example. + unique_labels, counts = np.unique(labels, return_counts=True) + solo_label_idx = np.where(counts == 1) + solo_labels = unique_labels[solo_label_idx] + for solo_label in solo_labels: + loc = labels.index(solo_label) + labels.pop(loc) + sentences.pop(loc) + ds[split] = Dataset.from_dict({"labels": labels, "sentences": sentences}) + + self.dataset = DatasetDict(ds) diff --git a/mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py b/mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py old mode 100644 new mode 100755 index 400710b7ed..7967e3b929 --- a/mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +++ b/mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py @@ -1,3 +1,11 @@ +import itertools + +from datasets import Dataset, DatasetDict + +from mteb.abstasks.clustering import ( + AbsTaskClustering, + _check_label_distribution, +) from mteb.abstasks.clustering_legacy import AbsTaskClusteringLegacy from mteb.abstasks.task_metadata import TaskMetadata @@ -35,4 +43,61 @@ class StackExchangeClusteringVN(AbsTaskClusteringLegacy): } """, adapted_from=["StackExchangeClustering"], + superseded_by="StackExchangeClustering-VN.v2", ) + + +class StackExchangeFastClusteringVN(AbsTaskClustering): + metadata = TaskMetadata( + name="StackExchangeClustering-VN.v2", + description="A translated dataset from Clustering of titles from 121 stackexchanges. Clustering of 25 sets, each with 10-50 classes, and each class with 100 - 1000 sentences. The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. - Applies advanced embedding models to filter the translations. - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.", + reference="https://arxiv.org/abs/2104.07081", + dataset={ + "path": "GreenNode/stackexchange-clustering-vn", + "revision": "cf01db048f2bf705741675b51613dc48e0bb122b", + }, + type="Clustering", + category="t2c", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="v_measure", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Web", "Written"], + task_subtypes=["Thematic clustering"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["StackExchangeClustering"], + ) + + def dataset_transform(self): + ds = {} + for split in self.metadata.eval_splits: + labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"])) + sentences = list( + itertools.chain.from_iterable(self.dataset[split]["sentences"]) + ) + + _check_label_distribution(self.dataset[split]) + + ds[split] = Dataset.from_dict({"labels": labels, "sentences": sentences}) + self.dataset = DatasetDict(ds) + self.dataset = self.stratified_subsampling( + self.dataset, + self.seed, + self.metadata.eval_splits, + label="labels", + ) + self.max_fraction_of_documents_to_embed = None diff --git a/mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py b/mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py old mode 100644 new mode 100755 index 8f83656a51..00dfc3b9ea --- a/mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +++ b/mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py @@ -1,4 +1,12 @@ -from mteb.abstasks import AbsTaskClusteringLegacy +import itertools + +from datasets import Dataset, DatasetDict + +from mteb.abstasks.clustering import ( + AbsTaskClustering, + _check_label_distribution, +) +from mteb.abstasks.clustering_legacy import AbsTaskClusteringLegacy from mteb.abstasks.task_metadata import TaskMetadata @@ -35,4 +43,54 @@ class TwentyNewsgroupsClusteringVN(AbsTaskClusteringLegacy): } """, adapted_from=["TwentyNewsgroupsClustering"], + superseded_by="TwentyNewsgroupsClustering-VN.v2", + ) + + +class TwentyNewsgroupsFastClusteringVN(AbsTaskClustering): + metadata = TaskMetadata( + name="TwentyNewsgroupsClustering-VN.v2", + description="A translated dataset from Clustering of the 20 Newsgroups dataset (subject only). The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. - Applies advanced embedding models to filter the translations. - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.", + reference="https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html", + dataset={ + "path": "GreenNode/twentynewsgroups-clustering-vn", + "revision": "770e1b9029cd85c79410bc6df1528a43fc2b9ad1", + }, + type="Clustering", + category="t2c", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="v_measure", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["News", "Written"], + task_subtypes=["Thematic clustering"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["TwentyNewsgroupsClustering-VN"], ) + + def dataset_transform(self): + ds = {} + for split in self.metadata.eval_splits: + labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"])) + sentences = list( + itertools.chain.from_iterable(self.dataset[split]["sentences"]) + ) + + _check_label_distribution(self.dataset[split]) + + ds[split] = Dataset.from_dict({"labels": labels, "sentences": sentences}) + self.dataset = DatasetDict(ds)