From 5f4af5fdb231b68b2271510f81700ed9bd4fdb8d Mon Sep 17 00:00:00 2001 From: Isaac Chung Date: Thu, 13 Feb 2025 15:06:16 +0000 Subject: [PATCH 1/3] add image clustering descriptive stats and run --- mteb/abstasks/Image/AbsTaskImageClustering.py | 73 ++- .../ImageClustering/CIFAR100Clustering.json | 314 +++++++++ .../ImageClustering/CIFAR10Clustering.json | 44 ++ .../ImageClustering/ImageNetDog15Clustering.json | 59 ++ .../ImageClustering/TinyImageNetClustering.json | 614 ++++++++++++++++++ scripts/mieb_des_stats.py | 18 + 6 files changed, 1120 insertions(+), 2 deletions(-) create mode 100644 mteb/descriptive_stats/Image/ImageClustering/CIFAR100Clustering.json create mode 100644 mteb/descriptive_stats/Image/ImageClustering/CIFAR10Clustering.json create mode 100644 mteb/descriptive_stats/Image/ImageClustering/ImageNetDog15Clustering.json create mode 100644 mteb/descriptive_stats/Image/ImageClustering/TinyImageNetClustering.json create mode 100644 scripts/mieb_des_stats.py diff --git a/mteb/abstasks/Image/AbsTaskImageClustering.py b/mteb/abstasks/Image/AbsTaskImageClustering.py index 8152bf10f7..585bf4ea5b 100644 --- a/mteb/abstasks/Image/AbsTaskImageClustering.py +++ b/mteb/abstasks/Image/AbsTaskImageClustering.py @@ -1,6 +1,7 @@ from __future__ import annotations import logging +from collections import Counter from typing import Any from datasets import Dataset @@ -10,10 +11,43 @@ from ...encoder_interface import Encoder from ...evaluation.evaluators import ImageClusteringEvaluator from ..AbsTask import AbsTask, ScoresDict +from ..TaskMetadata import DescriptiveStatistics logger = logging.getLogger(__name__) +class ImageClusteringDescriptiveStatistics(DescriptiveStatistics): + """Descriptive statistics for ImageClustering + + Attributes: + num_samples: number of samples in the dataset. 
+ + min_image_width: Minimum width of images + average_image_width: Average width of images + max_image_width: Maximum width of images + + min_image_height: Minimum height of images + average_image_height: Average height of images + max_image_height: Maximum height of images + + unique_num_labels: Number of unique labels + labels: dict of label frequencies + """ + + num_samples: int + + min_image_width: float + average_image_width: float + max_image_width: float + + min_image_height: float + average_image_height: float + max_image_height: float + + unique_num_labels: int + labels: dict[str, dict[str, int]] + + class AbsTaskImageClustering(AbsTask): """Abstract class for Clustering tasks The similarity is computed between pairs and the results are ranked. @@ -34,8 +68,43 @@ def _add_main_score(self, scores: dict[HFSubset, ScoresDict]) -> None: def _calculate_metrics_from_split( self, split: str, hf_subset: str | None = None, compute_overall: bool = False - ): - pass + ) -> ImageClusteringDescriptiveStatistics: + if hf_subset: + imgs = self.dataset[hf_subset][split][self.image_column_name] + labels = self.dataset[hf_subset][split][self.label_column_name] + elif compute_overall: + imgs = [] + labels = [] + for hf_subset in self.metadata.eval_langs: + imgs.extend(self.dataset[hf_subset][split][self.image_column_name]) + labels.extend(self.dataset[hf_subset][split][self.label_column_name]) + else: + imgs = self.dataset[split][self.image_column_name] + labels = self.dataset[split][self.label_column_name] + + num_samples = len(labels) + unique_num_labels = len(set(labels)) + label_count = Counter(labels) + + img_widths, img_heights = [], [] + for img in imgs: + width, height = img.size + img_heights.append(height) + img_widths.append(width) + + return ImageClusteringDescriptiveStatistics( + num_samples=num_samples, + unique_num_labels=unique_num_labels, + min_image_width=min(img_widths), + average_image_width=sum(img_widths) / len(img_widths), 
max_image_width=max(img_widths), + min_image_height=min(img_heights), + average_image_height=sum(img_heights) / len(img_heights), + max_image_height=max(img_heights), + labels={ + str(label): {"count": count} for label, count in label_count.items() + }, + ) def _evaluate_subset( self, diff --git a/mteb/descriptive_stats/Image/ImageClustering/CIFAR100Clustering.json b/mteb/descriptive_stats/Image/ImageClustering/CIFAR100Clustering.json new file mode 100644 index 0000000000..e8a282bc67 --- /dev/null +++ b/mteb/descriptive_stats/Image/ImageClustering/CIFAR100Clustering.json @@ -0,0 +1,314 @@ +{ + "test": { + "num_samples": 10000, + "unique_num_labels": 100, + "min_image_width": 32, + "average_image_width": 32.0, + "max_image_width": 32, + "min_image_height": 32, + "average_image_height": 32.0, + "max_image_height": 32, + "labels": { + "49": { + "count": 100 + }, + "33": { + "count": 100 + }, + "72": { + "count": 100 + }, + "51": { + "count": 100 + }, + "71": { + "count": 100 + }, + "92": { + "count": 100 + }, + "15": { + "count": 100 + }, + "14": { + "count": 100 + }, + "23": { + "count": 100 + }, + "0": { + "count": 100 + }, + "75": { + "count": 100 + }, + "81": { + "count": 100 + }, + "69": { + "count": 100 + }, + "40": { + "count": 100 + }, + "43": { + "count": 100 + }, + "97": { + "count": 100 + }, + "70": { + "count": 100 + }, + "53": { + "count": 100 + }, + "29": { + "count": 100 + }, + "21": { + "count": 100 + }, + "16": { + "count": 100 + }, + "39": { + "count": 100 + }, + "8": { + "count": 100 + }, + "20": { + "count": 100 + }, + "61": { + "count": 100 + }, + "41": { + "count": 100 + }, + "93": { + "count": 100 + }, + "56": { + "count": 100 + }, + "73": { + "count": 100 + }, + "58": { + "count": 100 + }, + "11": { + "count": 100 + }, + "25": { + "count": 100 + }, + "37": { + "count": 100 + }, + "63": { + "count": 100 + }, + "24": { + "count": 100 + }, + "22": { + "count": 100 + }, + "17": { + "count": 100 + }, + "4": { + "count": 100 + }, + "6": { + "count": 
100 + }, + "9": { + "count": 100 + }, + "57": { + "count": 100 + }, + "2": { + "count": 100 + }, + "32": { + "count": 100 + }, + "52": { + "count": 100 + }, + "42": { + "count": 100 + }, + "77": { + "count": 100 + }, + "27": { + "count": 100 + }, + "65": { + "count": 100 + }, + "7": { + "count": 100 + }, + "35": { + "count": 100 + }, + "82": { + "count": 100 + }, + "66": { + "count": 100 + }, + "90": { + "count": 100 + }, + "67": { + "count": 100 + }, + "91": { + "count": 100 + }, + "10": { + "count": 100 + }, + "78": { + "count": 100 + }, + "54": { + "count": 100 + }, + "89": { + "count": 100 + }, + "18": { + "count": 100 + }, + "13": { + "count": 100 + }, + "50": { + "count": 100 + }, + "26": { + "count": 100 + }, + "83": { + "count": 100 + }, + "47": { + "count": 100 + }, + "95": { + "count": 100 + }, + "76": { + "count": 100 + }, + "59": { + "count": 100 + }, + "85": { + "count": 100 + }, + "19": { + "count": 100 + }, + "46": { + "count": 100 + }, + "1": { + "count": 100 + }, + "74": { + "count": 100 + }, + "60": { + "count": 100 + }, + "64": { + "count": 100 + }, + "45": { + "count": 100 + }, + "36": { + "count": 100 + }, + "87": { + "count": 100 + }, + "30": { + "count": 100 + }, + "99": { + "count": 100 + }, + "80": { + "count": 100 + }, + "28": { + "count": 100 + }, + "98": { + "count": 100 + }, + "12": { + "count": 100 + }, + "94": { + "count": 100 + }, + "68": { + "count": 100 + }, + "44": { + "count": 100 + }, + "31": { + "count": 100 + }, + "79": { + "count": 100 + }, + "34": { + "count": 100 + }, + "55": { + "count": 100 + }, + "62": { + "count": 100 + }, + "96": { + "count": 100 + }, + "84": { + "count": 100 + }, + "38": { + "count": 100 + }, + "86": { + "count": 100 + }, + "5": { + "count": 100 + }, + "48": { + "count": 100 + }, + "3": { + "count": 100 + }, + "88": { + "count": 100 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/ImageClustering/CIFAR10Clustering.json 
b/mteb/descriptive_stats/Image/ImageClustering/CIFAR10Clustering.json new file mode 100644 index 0000000000..34ff70e050 --- /dev/null +++ b/mteb/descriptive_stats/Image/ImageClustering/CIFAR10Clustering.json @@ -0,0 +1,44 @@ +{ + "test": { + "num_samples": 10000, + "unique_num_labels": 10, + "min_image_width": 32, + "average_image_width": 32.0, + "max_image_width": 32, + "min_image_height": 32, + "average_image_height": 32.0, + "max_image_height": 32, + "labels": { + "3": { + "count": 1000 + }, + "8": { + "count": 1000 + }, + "0": { + "count": 1000 + }, + "6": { + "count": 1000 + }, + "1": { + "count": 1000 + }, + "9": { + "count": 1000 + }, + "5": { + "count": 1000 + }, + "7": { + "count": 1000 + }, + "4": { + "count": 1000 + }, + "2": { + "count": 1000 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/ImageClustering/ImageNetDog15Clustering.json b/mteb/descriptive_stats/Image/ImageClustering/ImageNetDog15Clustering.json new file mode 100644 index 0000000000..7719d70e57 --- /dev/null +++ b/mteb/descriptive_stats/Image/ImageClustering/ImageNetDog15Clustering.json @@ -0,0 +1,59 @@ +{ + "test": { + "num_samples": 1076, + "unique_num_labels": 15, + "min_image_width": 224, + "average_image_width": 224.0, + "max_image_width": 224, + "min_image_height": 224, + "average_image_height": 224.0, + "max_image_height": 224, + "labels": { + "0": { + "count": 152 + }, + "1": { + "count": 88 + }, + "2": { + "count": 75 + }, + "3": { + "count": 96 + }, + "4": { + "count": 57 + }, + "5": { + "count": 50 + }, + "6": { + "count": 52 + }, + "7": { + "count": 50 + }, + "8": { + "count": 50 + }, + "9": { + "count": 50 + }, + "10": { + "count": 53 + }, + "11": { + "count": 57 + }, + "12": { + "count": 50 + }, + "13": { + "count": 100 + }, + "14": { + "count": 96 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/ImageClustering/TinyImageNetClustering.json 
b/mteb/descriptive_stats/Image/ImageClustering/TinyImageNetClustering.json new file mode 100644 index 0000000000..777c0bda4c --- /dev/null +++ b/mteb/descriptive_stats/Image/ImageClustering/TinyImageNetClustering.json @@ -0,0 +1,614 @@ +{ + "valid": { + "num_samples": 10000, + "unique_num_labels": 200, + "min_image_width": 64, + "average_image_width": 64.0, + "max_image_width": 64, + "min_image_height": 64, + "average_image_height": 64.0, + "max_image_height": 64, + "labels": { + "0": { + "count": 50 + }, + "1": { + "count": 50 + }, + "2": { + "count": 50 + }, + "3": { + "count": 50 + }, + "4": { + "count": 50 + }, + "5": { + "count": 50 + }, + "6": { + "count": 50 + }, + "7": { + "count": 50 + }, + "8": { + "count": 50 + }, + "9": { + "count": 50 + }, + "10": { + "count": 50 + }, + "11": { + "count": 50 + }, + "12": { + "count": 50 + }, + "13": { + "count": 50 + }, + "14": { + "count": 50 + }, + "15": { + "count": 50 + }, + "16": { + "count": 50 + }, + "17": { + "count": 50 + }, + "18": { + "count": 50 + }, + "19": { + "count": 50 + }, + "20": { + "count": 50 + }, + "21": { + "count": 50 + }, + "22": { + "count": 50 + }, + "23": { + "count": 50 + }, + "24": { + "count": 50 + }, + "25": { + "count": 50 + }, + "26": { + "count": 50 + }, + "27": { + "count": 50 + }, + "28": { + "count": 50 + }, + "29": { + "count": 50 + }, + "30": { + "count": 50 + }, + "31": { + "count": 50 + }, + "32": { + "count": 50 + }, + "33": { + "count": 50 + }, + "34": { + "count": 50 + }, + "35": { + "count": 50 + }, + "36": { + "count": 50 + }, + "37": { + "count": 50 + }, + "38": { + "count": 50 + }, + "39": { + "count": 50 + }, + "40": { + "count": 50 + }, + "41": { + "count": 50 + }, + "42": { + "count": 50 + }, + "43": { + "count": 50 + }, + "44": { + "count": 50 + }, + "45": { + "count": 50 + }, + "46": { + "count": 50 + }, + "47": { + "count": 50 + }, + "48": { + "count": 50 + }, + "49": { + "count": 50 + }, + "50": { + "count": 50 + }, + "51": { + "count": 50 + }, + "52": { + 
"count": 50 + }, + "53": { + "count": 50 + }, + "54": { + "count": 50 + }, + "55": { + "count": 50 + }, + "56": { + "count": 50 + }, + "57": { + "count": 50 + }, + "58": { + "count": 50 + }, + "59": { + "count": 50 + }, + "60": { + "count": 50 + }, + "61": { + "count": 50 + }, + "62": { + "count": 50 + }, + "63": { + "count": 50 + }, + "64": { + "count": 50 + }, + "65": { + "count": 50 + }, + "66": { + "count": 50 + }, + "67": { + "count": 50 + }, + "68": { + "count": 50 + }, + "69": { + "count": 50 + }, + "70": { + "count": 50 + }, + "71": { + "count": 50 + }, + "72": { + "count": 50 + }, + "73": { + "count": 50 + }, + "74": { + "count": 50 + }, + "75": { + "count": 50 + }, + "76": { + "count": 50 + }, + "77": { + "count": 50 + }, + "78": { + "count": 50 + }, + "79": { + "count": 50 + }, + "80": { + "count": 50 + }, + "81": { + "count": 50 + }, + "82": { + "count": 50 + }, + "83": { + "count": 50 + }, + "84": { + "count": 50 + }, + "85": { + "count": 50 + }, + "86": { + "count": 50 + }, + "87": { + "count": 50 + }, + "88": { + "count": 50 + }, + "89": { + "count": 50 + }, + "90": { + "count": 50 + }, + "91": { + "count": 50 + }, + "92": { + "count": 50 + }, + "93": { + "count": 50 + }, + "94": { + "count": 50 + }, + "95": { + "count": 50 + }, + "96": { + "count": 50 + }, + "97": { + "count": 50 + }, + "98": { + "count": 50 + }, + "99": { + "count": 50 + }, + "100": { + "count": 50 + }, + "101": { + "count": 50 + }, + "102": { + "count": 50 + }, + "103": { + "count": 50 + }, + "104": { + "count": 50 + }, + "105": { + "count": 50 + }, + "106": { + "count": 50 + }, + "107": { + "count": 50 + }, + "108": { + "count": 50 + }, + "109": { + "count": 50 + }, + "110": { + "count": 50 + }, + "111": { + "count": 50 + }, + "112": { + "count": 50 + }, + "113": { + "count": 50 + }, + "114": { + "count": 50 + }, + "115": { + "count": 50 + }, + "116": { + "count": 50 + }, + "117": { + "count": 50 + }, + "118": { + "count": 50 + }, + "119": { + "count": 50 + }, + "120": { + 
"count": 50 + }, + "121": { + "count": 50 + }, + "122": { + "count": 50 + }, + "123": { + "count": 50 + }, + "124": { + "count": 50 + }, + "125": { + "count": 50 + }, + "126": { + "count": 50 + }, + "127": { + "count": 50 + }, + "128": { + "count": 50 + }, + "129": { + "count": 50 + }, + "130": { + "count": 50 + }, + "131": { + "count": 50 + }, + "132": { + "count": 50 + }, + "133": { + "count": 50 + }, + "134": { + "count": 50 + }, + "135": { + "count": 50 + }, + "136": { + "count": 50 + }, + "137": { + "count": 50 + }, + "138": { + "count": 50 + }, + "139": { + "count": 50 + }, + "140": { + "count": 50 + }, + "141": { + "count": 50 + }, + "142": { + "count": 50 + }, + "143": { + "count": 50 + }, + "144": { + "count": 50 + }, + "145": { + "count": 50 + }, + "146": { + "count": 50 + }, + "147": { + "count": 50 + }, + "148": { + "count": 50 + }, + "149": { + "count": 50 + }, + "150": { + "count": 50 + }, + "151": { + "count": 50 + }, + "152": { + "count": 50 + }, + "153": { + "count": 50 + }, + "154": { + "count": 50 + }, + "155": { + "count": 50 + }, + "156": { + "count": 50 + }, + "157": { + "count": 50 + }, + "158": { + "count": 50 + }, + "159": { + "count": 50 + }, + "160": { + "count": 50 + }, + "161": { + "count": 50 + }, + "162": { + "count": 50 + }, + "163": { + "count": 50 + }, + "164": { + "count": 50 + }, + "165": { + "count": 50 + }, + "166": { + "count": 50 + }, + "167": { + "count": 50 + }, + "168": { + "count": 50 + }, + "169": { + "count": 50 + }, + "170": { + "count": 50 + }, + "171": { + "count": 50 + }, + "172": { + "count": 50 + }, + "173": { + "count": 50 + }, + "174": { + "count": 50 + }, + "175": { + "count": 50 + }, + "176": { + "count": 50 + }, + "177": { + "count": 50 + }, + "178": { + "count": 50 + }, + "179": { + "count": 50 + }, + "180": { + "count": 50 + }, + "181": { + "count": 50 + }, + "182": { + "count": 50 + }, + "183": { + "count": 50 + }, + "184": { + "count": 50 + }, + "185": { + "count": 50 + }, + "186": { + "count": 50 + }, + 
"187": { + "count": 50 + }, + "188": { + "count": 50 + }, + "189": { + "count": 50 + }, + "190": { + "count": 50 + }, + "191": { + "count": 50 + }, + "192": { + "count": 50 + }, + "193": { + "count": 50 + }, + "194": { + "count": 50 + }, + "195": { + "count": 50 + }, + "196": { + "count": 50 + }, + "197": { + "count": 50 + }, + "198": { + "count": 50 + }, + "199": { + "count": 50 + } + } + } +} \ No newline at end of file diff --git a/scripts/mieb_des_stats.py b/scripts/mieb_des_stats.py new file mode 100644 index 0000000000..babbc7b4e7 --- /dev/null +++ b/scripts/mieb_des_stats.py @@ -0,0 +1,18 @@ +from __future__ import annotations + +from multiprocessing import Pool, cpu_count + +from tqdm import tqdm + +import mteb + + +def process_task(task): + task.calculate_metadata_metrics() + + +if __name__ == "__main__": + tasks = mteb.get_tasks(task_types=["ImageClustering"]) + + with Pool(cpu_count()) as pool: + list(tqdm(pool.imap(process_task, tasks), total=len(tasks))) From 498c367690a136e5b4dfe631c99bab0ade2703de Mon Sep 17 00:00:00 2001 From: Isaac Chung Date: Thu, 13 Feb 2025 15:09:45 +0000 Subject: [PATCH 2/3] finish off last one --- .../ImageClustering/ImageNet10Clustering.json | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 mteb/descriptive_stats/Image/ImageClustering/ImageNet10Clustering.json diff --git a/mteb/descriptive_stats/Image/ImageClustering/ImageNet10Clustering.json b/mteb/descriptive_stats/Image/ImageClustering/ImageNet10Clustering.json new file mode 100644 index 0000000000..8d367e6406 --- /dev/null +++ b/mteb/descriptive_stats/Image/ImageClustering/ImageNet10Clustering.json @@ -0,0 +1,44 @@ +{ + "test": { + "num_samples": 13000, + "unique_num_labels": 10, + "min_image_width": 224, + "average_image_width": 224.0, + "max_image_width": 224, + "min_image_height": 224, + "average_image_height": 224.0, + "max_image_height": 224, + "labels": { + "0": { + "count": 1300 + }, + "1": { + "count": 1300 + }, + "2": { + "count": 1300 
+ }, + "3": { + "count": 1300 + }, + "4": { + "count": 1300 + }, + "5": { + "count": 1300 + }, + "6": { + "count": 1300 + }, + "7": { + "count": 1300 + }, + "8": { + "count": 1300 + }, + "10": { + "count": 1300 + } + } + } +} \ No newline at end of file From 39d76d1f85b653ec44fcb1e9612b80b2c8cd1c21 Mon Sep 17 00:00:00 2001 From: Isaac Chung Date: Thu, 13 Feb 2025 15:19:05 +0000 Subject: [PATCH 3/3] remove script --- scripts/mieb_des_stats.py | 18 ------------------ 1 file changed, 18 deletions(-) delete mode 100644 scripts/mieb_des_stats.py diff --git a/scripts/mieb_des_stats.py b/scripts/mieb_des_stats.py deleted file mode 100644 index babbc7b4e7..0000000000 --- a/scripts/mieb_des_stats.py +++ /dev/null @@ -1,18 +0,0 @@ -from __future__ import annotations - -from multiprocessing import Pool, cpu_count - -from tqdm import tqdm - -import mteb - - -def process_task(task): - task.calculate_metadata_metrics() - - -if __name__ == "__main__": - tasks = mteb.get_tasks(task_types=["ImageClustering"]) - - with Pool(cpu_count()) as pool: - list(tqdm(pool.imap(process_task, tasks), total=len(tasks)))