huggingface · qubvel · Oct 29, 2024 · Aug 29, 2024 · Aug 30, 2024 · Aug 30, 2024
diff --git a/docs/source/en/model_doc/superpoint.md b/docs/source/en/model_doc/superpoint.md
@@ -86,23 +86,28 @@ model = SuperPointForKeypointDetection.from_pretrained("magic-leap-community/sup
 
 inputs = processor(images, return_tensors="pt")
 outputs = model(**inputs)
+image_sizes = torch.tensor([image.size for image in images]).flip(1)
+outputs = processor.post_process_keypoint_detection(outputs, image_sizes)
 
-for i in range(len(images)):
-    image_mask = outputs.mask[i]
-    image_indices = torch.nonzero(image_mask).squeeze()
-    image_keypoints = outputs.keypoints[i][image_indices]
-    image_scores = outputs.scores[i][image_indices]
-    image_descriptors = outputs.descriptors[i][image_indices]
+for output in outputs:
+    keypoints = output["keypoints"]
+    scores = output["scores"]
+    descriptors = output["descriptors"]
 ```
 
-You can then print the keypoints on the image to visualize the result :
+You can then plot the keypoints on the image of your choice to visualize the result:
 ```python
-import cv2
-for keypoint, score in zip(image_keypoints, image_scores):
-    keypoint_x, keypoint_y = int(keypoint[0].item()), int(keypoint[1].item())
-    color = tuple([score.item() * 255] * 3)
-    image = cv2.circle(image, (keypoint_x, keypoint_y), 2, color)
-cv2.imwrite("output_image.png", image)
+import matplotlib.pyplot as plt
+plt.axis("off")
+plt.imshow(image)
+plt.scatter(
+    keypoints[:, 0],
+    keypoints[:, 1],
+    c=scores * 100,
+    s=scores * 50,
+    alpha=0.8
+)
+plt.savefig(f"output_image.png")
 ```
 
 This model was contributed by [stevenbucaille](https://huggingface.co/stevenbucaille).
@@ -123,6 +128,7 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h
 [[autodoc]] SuperPointImageProcessor
 
 - preprocess
+- post_process_keypoint_detection
 
 ## SuperPointForKeypointDetection
 

diff --git a/src/transformers/models/superpoint/image_processing_superpoint.py b/src/transformers/models/superpoint/image_processing_superpoint.py
@@ -17,7 +17,7 @@
 
 import numpy as np
 
-from ... import is_vision_available
+from ... import is_torch_available, is_vision_available
 from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
 from ...image_transforms import resize, to_channel_dimension_format
 from ...image_utils import (
@@ -30,8 +30,12 @@
     valid_images,
 )
 from ...utils import TensorType, logging, requires_backends
+from .modeling_superpoint import SuperPointKeypointDescriptionOutput
 
 
+if is_torch_available():
+    import torch
+
 if is_vision_available():
     import PIL
 
@@ -270,3 +274,46 @@ def preprocess(
         data = {"pixel_values": images}
 
         return BatchFeature(data=data, tensor_type=return_tensors)
+
+    def post_process_keypoint_detection(
+        self, outputs: SuperPointKeypointDescriptionOutput, target_sizes: torch.Tensor
+    ):
+        """
+        Converts the raw output of [`SuperPointForKeypointDetection`] into lists of keypoints, scores and descriptors
+        with coordinates absolute to the original image sizes.
+
+        Args:
+            outputs ([`SuperPointKeypointDescriptionOutput`]):
+                Raw outputs of the model.
+            target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
+                Tensor containing the size (h, w) of each image of the batch. This must be the original
+                image size (before any processing).
+        Returns:
+            `List[Dict]`: A list of dictionaries, each dictionary containing the keypoints, scores and descriptors for
+            an image in the batch as predicted by the model.
+        """
+        if len(outputs.mask) != len(target_sizes):  # one size entry is required per batch item
+            raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the mask")
+        if target_sizes.shape[1] != 2:  # every entry must be a (h, w) pair
+            raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")
+
+        masked_keypoints = outputs.keypoints.clone()  # copy, so the in-place scaling below leaves `outputs` untouched
+
+        for keypoints, target_size in zip(masked_keypoints, target_sizes):  # keypoints are (x, y), sizes are (h, w)
+            keypoints[:, 0] = keypoints[:, 0] * target_size[1]  # x scaled by the width
+            keypoints[:, 1] = keypoints[:, 1] * target_size[0]  # y scaled by the height
+
+        # Cast the scaled coordinates to int32. NOTE(review): the cast drops the fractional part (no rounding) - confirm
+        masked_keypoints = masked_keypoints.to(torch.int32)
+
+        results = []
+        for image_mask, keypoints, scores, descriptors in zip(
+            outputs.mask, masked_keypoints, outputs.scores, outputs.descriptors
+        ):
+            indices = torch.nonzero(image_mask).squeeze(1)  # positions with a non-zero mask; assumes a 1-D mask per image
+            keypoints = keypoints[indices]  # keep only the entries selected by the mask
+            scores = scores[indices]
+            descriptors = descriptors[indices]
+            results.append({"keypoints": keypoints, "scores": scores, "descriptors": descriptors})
+
+        return results
diff --git a/src/transformers/models/superpoint/modeling_superpoint.py b/src/transformers/models/superpoint/modeling_superpoint.py
@@ -258,6 +258,9 @@ def _extract_keypoints(self, scores: torch.Tensor) -> Tuple[torch.Tensor, torch.
         # Convert (y, x) to (x, y)
         keypoints = torch.flip(keypoints, [1]).float()
 
+        # Convert to relative coordinates
+        keypoints = keypoints / torch.tensor([width, height], device=keypoints.device)
+
         return keypoints, scores
 
 

diff --git a/tests/models/superpoint/test_image_processing_superpoint.py b/tests/models/superpoint/test_image_processing_superpoint.py
@@ -15,17 +15,21 @@
 
 import numpy as np
 
-from transformers.testing_utils import require_torch, require_vision
-from transformers.utils import is_vision_available
+from transformers.testing_utils import require_torch, require_vision, slow
+from transformers.utils import is_torch_available, is_vision_available
 
 from ...test_image_processing_common import (
     ImageProcessingTestMixin,
     prepare_image_inputs,
 )
 
 
+if is_torch_available():
+    import torch
+
 if is_vision_available():
     from transformers import SuperPointImageProcessor
+    from transformers.models.superpoint.modeling_superpoint import SuperPointKeypointDescriptionOutput
 
 
 class SuperPointImageProcessingTester(unittest.TestCase):
@@ -70,6 +74,23 @@ def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=F
             torchify=torchify,
         )
 
+    def prepare_keypoint_detection_output(self, pixel_values):  # builds a synthetic model output for the tests
+        max_number_keypoints = 50
+        batch_size = len(pixel_values)  # only the batch length of `pixel_values` is used
+        mask = torch.zeros((batch_size, max_number_keypoints))  # all slots start as zero (masked out)
+        keypoints = torch.zeros((batch_size, max_number_keypoints, 2))
+        scores = torch.zeros((batch_size, max_number_keypoints))
+        descriptors = torch.zeros((batch_size, max_number_keypoints, 16))
+        for i in range(batch_size):
+            random_number_keypoints = np.random.randint(0, max_number_keypoints)  # upper bound exclusive: 0..49
+            mask[i, :random_number_keypoints] = 1  # flag the leading slots as valid
+            keypoints[i, :random_number_keypoints] = torch.rand((random_number_keypoints, 2))  # values in [0, 1)
+            scores[i, :random_number_keypoints] = torch.rand((random_number_keypoints,))
+            descriptors[i, :random_number_keypoints] = torch.rand((random_number_keypoints, 16))
+        return SuperPointKeypointDescriptionOutput(
+            loss=None, keypoints=keypoints, scores=scores, descriptors=descriptors, mask=mask, hidden_states=None
+        )
+
 
 @require_torch
 @require_vision
@@ -110,3 +131,25 @@ def test_input_image_properly_converted_to_grayscale(self):
         pre_processed_images = image_processor.preprocess(image_inputs)
         for image in pre_processed_images["pixel_values"]:
             self.assertTrue(np.all(image[0, ...] == image[1, ...]) and np.all(image[1, ...] == image[2, ...]))
+
+    @slow  # NOTE(review): the test only uses synthetic tensors - confirm it really needs to be marked slow
+    @require_torch
+    def test_post_processing_keypoint_detection(self):
+        image_processor = self.image_processing_class.from_dict(self.image_processor_dict)
+        image_inputs = self.image_processor_tester.prepare_image_inputs()
+        pre_processed_images = image_processor.preprocess(image_inputs, return_tensors="pt")
+        outputs = self.image_processor_tester.prepare_keypoint_detection_output(**pre_processed_images)  # fake output
+        image_sizes = torch.tensor([image.size for image in image_inputs]).flip(1)  # presumably (w, h) -> (h, w)
+        post_processed_outputs = image_processor.post_process_keypoint_detection(outputs, image_sizes)
+
+        self.assertTrue(len(post_processed_outputs) == self.image_processor_tester.batch_size)  # one dict per image
+        for post_processed_output, image_size in zip(post_processed_outputs, image_sizes):
+            self.assertTrue("keypoints" in post_processed_output)
+            self.assertTrue("descriptors" in post_processed_output)
+            self.assertTrue("scores" in post_processed_output)
+            keypoints = post_processed_output["keypoints"]
+            all_below_image_size = torch.all(keypoints[:, 0] <= image_size[1]) and torch.all(
+                keypoints[:, 1] <= image_size[0]  # column 0 is checked against index 1, column 1 against index 0
+            )
+            all_above_zero = torch.all(keypoints[:, 0] >= 0) and torch.all(keypoints[:, 1] >= 0)
+            self.assertTrue(all_below_image_size and all_above_zero)