diff --git a/mteb/descriptive_stats/BitextMining/BUCC.json b/mteb/descriptive_stats/BitextMining/BUCC.json index 55093a585f..f316056774 100644 --- a/mteb/descriptive_stats/BitextMining/BUCC.json +++ b/mteb/descriptive_stats/BitextMining/BUCC.json @@ -3,66 +3,96 @@ "num_samples": 35000, "number_of_characters": 146737556, "unique_pairs": 35000, - "min_sentence1_length": 16, - "average_sentence1_length": 99.10931428571429, - "max_sentence1_length": 204, - "unique_sentence1": 34978, - "min_sentence2_length": 17, - "average_sentence2_length": 101.14933691422246, - "max_sentence2_length": 339, - "unique_sentence2": 1133728, + "sentence1_statistics": { + "total_text_length": 3468826, + "min_text_length": 16, + "average_text_length": 99.10931428571429, + "max_text_length": 204, + "unique_texts": 34978 + }, + "sentence2_statistics": { + "total_text_length": 143268730, + "min_text_length": 17, + "average_text_length": 101.14933691422246, + "max_text_length": 339, + "unique_texts": 1133728 + }, "hf_subset_descriptive_stats": { "de-en": { "num_samples": 9580, "number_of_characters": 41450074, "unique_pairs": 9580, - "min_sentence1_length": 50, - "average_sentence1_length": 109.07974947807934, - "max_sentence1_length": 204, - "unique_sentence1": 9573, - "min_sentence2_length": 17, - "average_sentence2_length": 101.18043156531952, - "max_sentence2_length": 293, - "unique_sentence2": 397151 + "sentence1_statistics": { + "total_text_length": 1044984, + "min_text_length": 50, + "average_text_length": 109.07974947807934, + "max_text_length": 204, + "unique_texts": 9573 + }, + "sentence2_statistics": { + "total_text_length": 40405090, + "min_text_length": 17, + "average_text_length": 101.18043156531952, + "max_text_length": 293, + "unique_texts": 397151 + } }, "fr-en": { "num_samples": 9086, "number_of_characters": 38272453, "unique_pairs": 9086, - "min_sentence1_length": 43, - "average_sentence1_length": 99.31785163988553, - "max_sentence1_length": 174, - "unique_sentence1": 9081, - "min_sentence2_length": 21, - "average_sentence2_length": 101.05202942051324, - "max_sentence2_length": 319, - "unique_sentence2": 368033 + "sentence1_statistics": { + "total_text_length": 902402, + "min_text_length": 43, + "average_text_length": 99.31785163988553, + "max_text_length": 174, + "unique_texts": 9081 + }, + "sentence2_statistics": { + "total_text_length": 37370051, + "min_text_length": 21, + "average_text_length": 101.05202942051324, + "max_text_length": 319, + "unique_texts": 368033 + } }, "ru-en": { "num_samples": 14435, "number_of_characters": 57904085, "unique_pairs": 14435, - "min_sentence1_length": 40, - "average_sentence1_length": 101.6593003117423, - "max_sentence1_length": 186, - "unique_sentence1": 14425, - "min_sentence2_length": 21, - "average_sentence2_length": 101.06828784332406, - "max_sentence2_length": 339, - "unique_sentence2": 555503 + "sentence1_statistics": { + "total_text_length": 1467452, + "min_text_length": 40, + "average_text_length": 101.6593003117423, + "max_text_length": 186, + "unique_texts": 14425 + }, + "sentence2_statistics": { + "total_text_length": 56436633, + "min_text_length": 21, + "average_text_length": 101.06828784332406, + "max_text_length": 339, + "unique_texts": 555503 + } }, "zh-en": { "num_samples": 1899, "number_of_characters": 9110944, "unique_pairs": 1899, - "min_sentence1_length": 16, - "average_sentence1_length": 28.429699842022117, - "max_sentence1_length": 40, - "unique_sentence1": 1899, - "min_sentence2_length": 22, - "average_sentence2_length": 101.92388026108485, - "max_sentence2_length": 249, - "unique_sentence2": 88360 + "sentence1_statistics": { + "total_text_length": 53988, + "min_text_length": 16, + "average_text_length": 28.429699842022117, + "max_text_length": 40, + "unique_texts": 1899 + }, + "sentence2_statistics": { + "total_text_length": 9056956, + "min_text_length": 22, + "average_text_length": 101.92388026108485, + "max_text_length": 249, + "unique_texts": 88360 + } } } } diff --git a/mteb/descriptive_stats/Classification/DKHateClassification.json b/mteb/descriptive_stats/Classification/DKHateClassification.json index 595721b14e..bd38cac16f 100644 --- a/mteb/descriptive_stats/Classification/DKHateClassification.json +++ b/mteb/descriptive_stats/Classification/DKHateClassification.json @@ -1,37 +1,53 @@ { "test": { "num_samples": 329, - "number_of_characters": 29011, "number_texts_intersect_with_train": 4, - "min_text_length": 1, - "average_text_length": 88.17933130699087, - "max_text_length": 2434, - "unique_text": 326, - "unique_labels": 2, - "labels": { - "0": { - "count": 288 - }, - "1": { - "count": 41 + "text_statistics": { + "total_text_length": 29011, + "min_text_length": 1, + "average_text_length": 88.17933130699087, + "max_text_length": 2434, + "unique_texts": 326 + }, + "image_statistics": null, + "label_statistics": { + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 2, + "labels": { + "0": { + "count": 288 + }, + "1": { + "count": 41 + } } } }, "train": { "num_samples": 2960, - "number_of_characters": 307722, "number_texts_intersect_with_train": null, - "min_text_length": 1, - "average_text_length": 103.96013513513513, - "max_text_length": 5403, - "unique_text": 2902, - "unique_labels": 2, - "labels": { - "0": { - "count": 2576 - }, - "1": { - "count": 384 + "text_statistics": { + "total_text_length": 307722, + "min_text_length": 1, + "average_text_length": 103.96013513513513, + "max_text_length": 5403, + "unique_texts": 2902 + }, + "image_statistics": null, + "label_statistics": { + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 2, + "labels": { + "0": { + "count": 2576 + }, + "1": { + "count": 384 + } } } } diff --git a/mteb/descriptive_stats/Classification/FinancialPhrasebankClassification.json b/mteb/descriptive_stats/Classification/FinancialPhrasebankClassification.json index 52324ebe1a..53afab8fd8 100644 --- a/mteb/descriptive_stats/Classification/FinancialPhrasebankClassification.json +++ b/mteb/descriptive_stats/Classification/FinancialPhrasebankClassification.json @@ -1,22 +1,30 @@ { "train": { "num_samples": 2264, - "number_of_characters": 276123, "number_texts_intersect_with_train": null, - "min_text_length": 9, - "average_text_length": 121.96245583038869, - "max_text_length": 315, - "unique_text": 2259, - "unique_labels": 3, - "labels": { - "1": { - "count": 1391 - }, - "2": { - "count": 570 - }, - "0": { - "count": 303 + "text_statistics": { + "total_text_length": 276123, + "min_text_length": 9, + "average_text_length": 121.96245583038869, + "max_text_length": 315, + "unique_texts": 2259 + }, + "image_statistics": null, + "label_statistics": { + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 3, + "labels": { + "1": { + "count": 1391 + }, + "2": { + "count": 570 + }, + "0": { + "count": 303 + } } } } diff --git a/mteb/descriptive_stats/Classification/KorHateClassification.json b/mteb/descriptive_stats/Classification/KorHateClassification.json index 90022c35bd..21c91ea998 100644 --- a/mteb/descriptive_stats/Classification/KorHateClassification.json +++ b/mteb/descriptive_stats/Classification/KorHateClassification.json @@ -1,22 +1,30 @@ { "train": { "num_samples": 2048, - "number_of_characters": 79006, "number_texts_intersect_with_train": null, - "min_text_length": 4, - "average_text_length": 38.5771484375, - "max_text_length": 130, - "unique_text": 2048, - "unique_labels": 3, - "labels": { - "1": { - "count": 648 - }, - "2": { - "count": 904 - }, - "0": { - "count": 496 + "text_statistics": { + "total_text_length": 79006, + "min_text_length": 4, + "average_text_length": 38.5771484375, + "max_text_length": 130, + "unique_texts": 2048 + }, + "image_statistics": null, + "label_statistics": { + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 3, + "labels": { + "1": { + "count": 648 + }, + "2": { + "count": 904 + }, + "0": { + "count": 496 + } } } } diff --git a/mteb/descriptive_stats/Clustering/SwednClustering.json b/mteb/descriptive_stats/Clustering/SwednClustering.json index 73b1f158c5..6a59c69da8 100644 --- a/mteb/descriptive_stats/Clustering/SwednClustering.json +++ b/mteb/descriptive_stats/Clustering/SwednClustering.json @@ -1,27 +1,32 @@ { - "all": { - "num_samples": 4, - "number_of_characters": 2048, - "min_text_length": 512, - "average_text_length": 512.0, - "max_text_length": 512, - "unique_texts": 2047, - "min_labels_per_text": 234, - "average_labels_per_text": 512.0, - "max_labels_per_text": 1164, - "unique_labels": 4, - "labels": { - "culture": { - "count": 294 - }, - "domestic news": { - "count": 1164 - }, - "economy": { - "count": 234 - }, - "sports": { - "count": 356 + "test": { + "num_samples": 2048, + "text_statistics": { + "total_text_length": 3317163, + "min_text_length": 64, + "average_text_length": 1619.70849609375, + "max_text_length": 28913, + "unique_texts": 2047 + }, + "image_statistics": null, + "label_statistics": { + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 4, + "labels": { + "culture": { + "count": 294 + }, + "domestic news": { + "count": 1164 + }, + "economy": { + "count": 234 + }, + "sports": { + "count": 356 + } } } } diff --git a/mteb/descriptive_stats/Image/ZeroShotClassification/PatchCamelyonZeroShot.json b/mteb/descriptive_stats/Image/ZeroShotClassification/PatchCamelyonZeroShot.json index fd23fdd6fa..523e81f747 100644 --- a/mteb/descriptive_stats/Image/ZeroShotClassification/PatchCamelyonZeroShot.json +++ b/mteb/descriptive_stats/Image/ZeroShotClassification/PatchCamelyonZeroShot.json @@ -1,23 +1,37 @@ { "test": { "num_samples": 32768, - "unique_num_labels": 2, - "min_image_width": 96, - "average_image_width": 96.0, - "max_image_width": 96, - "min_image_height": 96, - "average_image_height": 96.0, - "max_image_height": 96, - "min_label_text_length": 35, - "average_label_text_length": 52.0, - "max_label_text_length": 69, - "labels": { - "0": { - "count": 16391 - }, - "1": { - "count": 16377 + "number_of_characters": null, + "text_statistics": null, + "image_statistics": { + "min_image_width": 96, + "average_image_width": 96.0, + "max_image_width": 96, + "min_image_height": 96, + "average_image_height": 96.0, + "max_image_height": 96, + "unique_images": 29383 + }, + "label_statistics": { + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 2, + "labels": { + "0": { + "count": 16391 + }, + "1": { + "count": 16377 + } } + }, + "candidates_labels_text_statistics": { + "total_text_length": 103, + "min_text_length": 34, + "average_text_length": 51.5, + "max_text_length": 69, + "unique_texts": 2 } } } diff --git a/mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py b/mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py index bfef39c9a5..2adfccfec7 100644 --- a/mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +++ b/mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py @@ -21,7 +21,7 @@ class BUCCBitextMining(AbsTaskBitextMining): name="BUCC", dataset={ "path": "mteb/BUCC", - "revision": "39f20d5ac4a82e59dbcecaabdd599b82cbefa666", + "revision": "414572247440f0ccacf7eb0bb70a31533a0e5443", }, description="BUCC bitext mining dataset", reference="https://comparable.limsi.fr/bucc2018/bucc2018-task.html", diff --git a/mteb/tasks/classification/eng/financial_phrasebank_classification.py b/mteb/tasks/classification/eng/financial_phrasebank_classification.py index c750e51042..27c67a4b36 100644 --- a/mteb/tasks/classification/eng/financial_phrasebank_classification.py +++ b/mteb/tasks/classification/eng/financial_phrasebank_classification.py @@ -36,9 +36,6 @@ class FinancialPhrasebankClassification(AbsTaskClassification): superseded_by="FinancialPhrasebankClassification.v2", ) - def dataset_transform(self): - self.dataset = self.dataset.rename_column("sentence", "text") - class FinancialPhrasebankClassificationV2(AbsTaskClassification): metadata = TaskMetadata( diff --git a/mteb/tasks/classification/kor/kor_hate_classification.py b/mteb/tasks/classification/kor/kor_hate_classification.py index 3d5a74f0f7..c508d979dd 100644 --- a/mteb/tasks/classification/kor/kor_hate_classification.py +++ b/mteb/tasks/classification/kor/kor_hate_classification.py @@ -44,18 +44,6 @@ class KorHateClassification(AbsTaskClassification): superseded_by="KorHateClassification.v2", ) - def dataset_transform(self): - keep_cols = ["comments", "hate"] - rename_dict = dict(zip(keep_cols, ["text", "label"])) - remove_cols = [ - col for col in self.dataset["test"].column_names if col not in keep_cols - ] - self.dataset = self.dataset.rename_columns(rename_dict) - self.dataset = self.dataset.remove_columns(remove_cols) - self.dataset = self.stratified_subsampling( - self.dataset, seed=self.seed, splits=["train"] - ) - class KorHateClassificationV2(AbsTaskClassification): metadata = TaskMetadata( diff --git a/mteb/tasks/clustering/swe/swedn_clustering.py b/mteb/tasks/clustering/swe/swedn_clustering.py index 1569011345..a59cd52fbb 100644 --- a/mteb/tasks/clustering/swe/swedn_clustering.py +++ b/mteb/tasks/clustering/swe/swedn_clustering.py @@ -8,14 +8,14 @@ class SwednClustering(AbsTaskClusteringLegacy): name="SwednClustering", dataset={ "path": "mteb/SwednClustering", - "revision": "7125017ead5797297f46e17b31bf78b56d12c2b2", + "revision": "45d2a99c3f1b6ee6189a6bb762ed74b7ef45dd9d", }, description="The SWE-DN corpus is based on 1,963,576 news articles from the Swedish newspaper Dagens Nyheter (DN) during the years 2000--2020. The articles are filtered to resemble the CNN/DailyMail dataset both regarding textual structure. This dataset uses the category labels as clusters.", reference="https://spraakbanken.gu.se/en/resources/swedn", type="Clustering", category="t2c", modalities=["text"], - eval_splits=["all"], + eval_splits=["test"], eval_langs=["swe-Latn"], main_score="v_measure", date=("2000-01-01", "2020-12-31"), # best guess diff --git a/mteb/tasks/zeroshot_classification/eng/templates/PatchCamelyon_labels.txt b/mteb/tasks/zeroshot_classification/eng/templates/PatchCamelyon_labels.txt new file mode 100644 index 0000000000..7188374485 --- /dev/null +++ b/mteb/tasks/zeroshot_classification/eng/templates/PatchCamelyon_labels.txt @@ -0,0 +1,2 @@ +lymph node +lymph node containing metastatic tumor tissue