encord-team · frederik-encord · Dec 6, 2023
@@ -44,7 +44,9 @@ def __init__(self, cache_dir: Path, subset_size: Optional[int] = None, **kwargs)
         self.label_rows = self.project.label_rows
 
     @abstractmethod
-    def iterate(self, desc: str = "") -> Generator[Tuple[dict, Optional[Image.Image]], None, None]:
+    def iterate(
+        self, desc: str = "", include_images: bool = True
+    ) -> Generator[Tuple[dict, Optional[Image.Image]], None, None]:
         pass
 
     @abstractmethod
@@ -77,7 +79,9 @@ def __init__(self, cache_dir: Path, subset_size: Optional[int] = None, skip_labe
             0,
         )
 
-    def iterate(self, desc: str = "") -> Generator[Tuple[dict, Optional[Image.Image]], None, None]:
+    def iterate(
+        self, desc: str = "", include_images: bool = True
+    ) -> Generator[Tuple[dict, Optional[Image.Image]], None, None]:
         with PrismaConnection(self.project_file_structure) as cache_db:
             pbar = tqdm(total=self.length, desc=desc, leave=False)
             for label_hash, label_row in self.label_rows.items():
@@ -87,7 +91,6 @@ def iterate(self, desc: str = "") -> Generator[Tuple[dict, Optional[Image.Image]
                     self.num_frames = len(label_row.data_units)
                     data_units = sorted(label_row.data_units.values(), key=lambda du: int(du["data_sequence"]))
                     for data_unit in data_units:
-
                         if self._skip_labeled_data:
                             du_label = data_unit.get("labels", {})
                             if du_label.get("objects", []) != [] or du_label.get("classifications", []) != []:
@@ -104,7 +107,7 @@ def iterate(self, desc: str = "") -> Generator[Tuple[dict, Optional[Image.Image]
                                 None,
                             )
                             image = None
-                            if img_metadata is not None:
+                            if img_metadata is not None and include_images:
                                 image = download_image(
                                     img_metadata.signed_url,
                                     project_dir=self.project_file_structure.project_dir,
@@ -156,7 +159,7 @@ def iterate(self, desc: str = "") -> Generator[Tuple[dict, Optional[Image.Image]
                                     continue
 
                             image_path = next(video_images_dir.glob(f"{self.du_hash}_{frame_id}.*"), None)
-                            if image_path:
+                            if image_path and include_images:
                                 yield fake_data_unit, Image.open(image_path)
                             else:
                                 yield fake_data_unit, None

@@ -34,7 +34,7 @@ def execute(self, iterator: Iterator, writer: CSVMetricWriter):
         valid_annotation_types = {annotation_type.value for annotation_type in self.metadata.annotation_type}
         found_any = False
 
-        for data_unit, _ in iterator.iterate(desc="Looking for duplicates"):
+        for data_unit, _ in iterator.iterate(desc="Looking for duplicates", include_images=False):
             objects = [obj for obj in data_unit["labels"].get("objects", []) if obj["shape"] in valid_annotation_types]
             polygons = [get_polygon(obj) for obj in objects]
 

@@ -63,7 +63,7 @@ def execute(self, iterator: Iterator, writer: CSVMetricWriter):
         hu_moments_df = get_hu_embeddings(iterator)
         hu_moments_identifiers = set(hu_moments_df["identifier"])
 
-        for data_unit, _ in iterator.iterate(desc="Computing moment similarity"):
+        for data_unit, _ in iterator.iterate(desc="Computing moment similarity", include_images=False):
             for obj in data_unit["labels"].get("objects", []):
                 if obj["shape"] not in valid_annotation_types:
                     continue

@@ -33,7 +33,7 @@ def execute(self, iterator: Iterator, writer: CSVMetricWriter):
         valid_annotation_types = {annotation_type.value for annotation_type in self.metadata.annotation_type}
         found_any = False
 
-        for data_unit, _ in iterator.iterate(desc="Computing closeness to border"):
+        for data_unit, _ in iterator.iterate(desc="Computing closeness to border", include_images=False):
             for obj in data_unit["labels"].get("objects", []):
                 if obj["shape"] not in valid_annotation_types:
                     continue

@@ -19,7 +19,6 @@
 
 def get_area(obj: dict) -> float:
     if obj["shape"] in {*BoxShapes, ObjectShape.POLYGON}:
-
         points = get_object_coordinates(obj)
         if points is None or len(points) < 3:
             logger.debug("Less than 3 points")
@@ -65,7 +64,7 @@ def execute(self, iterator: Iterator, writer: CSVMetricWriter):
         valid_annotation_types = {annotation_type.value for annotation_type in self.metadata.annotation_type}
         found_any = False
 
-        for data_unit, _ in iterator.iterate(desc="Computing object area"):
+        for data_unit, _ in iterator.iterate(desc="Computing object area", include_images=False):
             for obj in data_unit["labels"].get("objects", []):
                 if obj["shape"] not in valid_annotation_types:
                     continue

@@ -105,7 +105,7 @@ def execute(self, iterator: Iterator, writer: CSVMetricWriter):
         if not videos:
             logger.info("<yellow>[Skipping]</yellow> No videos in dataset. ")
 
-        for data_unit, _ in iterator.iterate(desc="Storing occlusion index"):
+        for data_unit, _ in iterator.iterate(desc="Storing occlusion index", include_images=False):
             label_row_hash = iterator.label_hash
             if label_row_hash not in videos.keys():
                 continue

@@ -70,7 +70,7 @@ def execute(self, iterator: Iterator, writer: CSVMetricWriter):
         label_hash = ""
         previous_objects = None
         previous_polygons = None
-        for data_unit, _ in iterator.iterate(desc="Looking for overlapping objects"):
+        for data_unit, _ in iterator.iterate(desc="Looking for overlapping objects", include_images=False):
             label_row = iterator.label_rows[iterator.label_hash]
             data_type = label_row["data_type"]
             if not (data_type == "video" or (data_type == "img_group" and len(label_row["data_units"]) > 1)):

@@ -96,7 +96,7 @@ def execute(self, iterator: Iterator, writer: CSVMetricWriter):
         # Prepare sliding window of previous two frames to compare polygons over time
         window: List[List[Tuple[dict, Polygon]]] = []
 
-        for data_unit, _ in iterator.iterate(desc="Looking for broken tracks"):
+        for data_unit, _ in iterator.iterate(desc="Looking for broken tracks", include_images=False):
             label_row = iterator.label_rows[iterator.label_hash]
             frame = iterator.frame
 
@@ -211,7 +211,7 @@ def execute(self, iterator: Iterator, writer: CSVMetricWriter):
         # Collect the results in the CSV file.
         # Everything not found above with get score "1" meaning "no issues".
         annotated = {k: False for k in error_store.errors}
-        for data_unit, _ in iterator.iterate(desc="Storing results"):
+        for data_unit, _ in iterator.iterate(desc="Storing results", include_images=False):
             for obj in data_unit["labels"].get("objects", []):
                 key = (obj["objectHash"], iterator.frame)
                 if key in error_store.errors:

@@ -20,6 +20,6 @@ def execute(self, iterator: Iterator, writer: CSVMetricWriter):
         if not iterator.project.ontology.objects:
             return
 
-        for data_unit, _ in iterator.iterate(desc="Counting objects"):
+        for data_unit, _ in iterator.iterate(desc="Counting objects", include_images=False):
             score = len(data_unit["labels"]["objects"]) if "objects" in data_unit["labels"] else 0
             writer.write(score)
@@ -22,7 +22,7 @@ def __init__(self):
         )
 
     def execute(self, iterator: Iterator, writer: CSVMetricWriter):
-        for _ in iterator.iterate(desc="Assigning random values to images"):
+        for _ in iterator.iterate(desc="Assigning random values to images", include_images=False):
             writer.write(np.random.uniform())
 
 
@@ -46,7 +46,9 @@ def execute(self, iterator: Iterator, writer: CSVMetricWriter):
         valid_annotation_types = {annotation_type.value for annotation_type in self.metadata.annotation_type}
 
         found_any = False
-        for data_unit, _ in iterator.iterate(desc="Searching for objects and assigning random scores"):
+        for data_unit, _ in iterator.iterate(
+            desc="Searching for objects and assigning random scores", include_images=False
+        ):
             for obj in data_unit["labels"].get("objects", []):
                 if not obj["shape"] in valid_annotation_types:
                     continue

@@ -108,7 +108,7 @@ def execute(self, iterator: Iterator, writer: CSVMetricWriter):
 
         data_hash_to_score = self._get_difficulty_ranking(cluster_size)
 
-        for data_unit, _ in iterator.iterate(desc="Writing scores to a file"):
+        for data_unit, _ in iterator.iterate(desc="Writing scores to a file", include_images=False):
             score = data_hash_to_score.get(data_unit["data_hash"])
             if score is not None:
                 writer.write(score=score)
@@ -92,7 +92,7 @@ def execute(self, iterator: Iterator, writer: CSVMetricWriter):
         fix_duplicate_image_orders_in_knn_graph_all_rows(query_res.indices)
         scores = self.score_images(embedding_info, query_res, iterator.project.project_hash)
 
-        for data_unit, _ in iterator.iterate(desc="Writing scores to a file"):
+        for data_unit, _ in iterator.iterate(desc="Writing scores to a file", include_images=False):
             data_unit_info = scores.get(data_unit["data_hash"])
             if data_unit_info is not None:
                 writer.write(

@@ -279,7 +279,7 @@ def execute(self, iterator: Iterator, writer: CSVMetricWriter):
         nearest_indexes = self.get_nearest_indexes()
         self.fix_nearest_indexes(nearest_indexes)
         key_score_pairs = self.create_key_score_pairs(nearest_indexes)
-        for data_unit, _ in iterator.iterate(desc="Storing index"):
+        for data_unit, _ in iterator.iterate(desc="Storing index", include_images=False):
             key = iterator.get_identifier()
             is_multiclass = is_multiclass_ontology(iterator.project.ontology)
 

@@ -135,7 +135,7 @@ def execute(self, iterator: Iterator, writer: CSVMetricWriter):
         label_scores = label_matches.mean(axis=-1)
 
         valid_annotation_types = {annotation_type.value for annotation_type in self.metadata.annotation_type}
-        for data_unit, _ in iterator.iterate(desc="Storing index"):
+        for data_unit, _ in iterator.iterate(desc="Storing index", include_images=False):
             for obj in data_unit["labels"].get("objects", []):
                 if obj["shape"] not in valid_annotation_types:
                     continue

@@ -165,7 +165,9 @@ def get_encord_classification(self, pred: Series, ontology_classification: Class
             manualAnnotation=False,
         )
 
-    def iterate(self, desc: str = "") -> Generator[Tuple[dict, Optional[Image.Image]], None, None]:
+    def iterate(
+        self, desc: str = "", include_images: bool = True
+    ) -> Generator[Tuple[dict, Optional[Image.Image]], None, None]:
         pbar = tqdm(total=self.length, desc=desc, leave=False)
         with PrismaConnection(self.project_file_structure) as cache_db:
             for label_hash, lh_group in self.predictions.groupby("label_hash"):
@@ -203,9 +205,7 @@ def iterate(self, desc: str = "") -> Generator[Tuple[dict, Optional[Image.Image]
                             logger.error("The prediction is not in the ontology objects or classifications")
 
                     du["labels"] = {"objects": objects, "classifications": classifications}
-                    image = self.get_image(fr_preds.iloc[0], cache_db=cache_db)
-                    if image is None:
-                        logger.error(f"Failed to open Image at frame: {self.du_hash}/{fr_preds.iloc[0]}")
+                    image = (include_images and self.get_image(fr_preds.iloc[0], cache_db=cache_db)) or None
                     yield du, image
                     pbar.update(1)