From 17d50e881b6e9cd1e9d4a36da17d21206caade52 Mon Sep 17 00:00:00 2001
From: Steven Bucaille <steven.bucaille@buawei.com>
Date: Thu, 29 Aug 2024 14:55:58 +0000
Subject: [PATCH 01/32] feat: Added int conversion and unwrapping

---
 .../superpoint/image_processing_superpoint.py | 30 ++++++++++++++++++-
 .../models/superpoint/modeling_superpoint.py  |  3 ++
 2 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/src/transformers/models/superpoint/image_processing_superpoint.py b/src/transformers/models/superpoint/image_processing_superpoint.py
index fbbb717570cb..b0ea901c4252 100644
--- a/src/transformers/models/superpoint/image_processing_superpoint.py
+++ b/src/transformers/models/superpoint/image_processing_superpoint.py
@@ -17,7 +17,7 @@
 
 import numpy as np
 
-from ... import is_vision_available
+from ... import is_torch_available, is_vision_available
 from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
 from ...image_transforms import resize, to_channel_dimension_format
 from ...image_utils import (
@@ -32,6 +32,9 @@
 from ...utils import TensorType, logging, requires_backends
 
 
+if is_torch_available():
+    import torch
+
 if is_vision_available():
     import PIL
 
@@ -270,3 +273,28 @@ def preprocess(
         data = {"pixel_values": images}
 
         return BatchFeature(data=data, tensor_type=return_tensors)
+
+    def post_process_keypoint_detection(self, outputs, target_sizes, unwrap_batch_dim=True):
+        if len(outputs.mask) != len(target_sizes):
+            raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")
+        if target_sizes.shape[1] != 2:
+            raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")
+
+        for keypoints, target_size in zip(outputs.keypoints, target_sizes):
+            keypoints[:, 0] = keypoints[:, 0] * target_size[1]
+            keypoints[:, 1] = keypoints[:, 1] * target_size[0]
+
+        # Convert masked_keypoints to int
+        masked_keypoints = outputs.keypoints.to(torch.int32)
+
+        outputs.keypoints = masked_keypoints
+
+        results = []
+        for i, image_mask in enumerate(outputs.mask):
+            indices = torch.nonzero(image_mask).squeeze(1)
+            keypoints = outputs.keypoints[i][indices]
+            scores = outputs.scores[i][indices]
+            descriptors = outputs.descriptors[i][indices]
+            results.append({"keypoints": keypoints, "scores": scores, "descriptors": descriptors})
+
+        return results
diff --git a/src/transformers/models/superpoint/modeling_superpoint.py b/src/transformers/models/superpoint/modeling_superpoint.py
index cfd3dfd86e8e..b77a90367d4a 100644
--- a/src/transformers/models/superpoint/modeling_superpoint.py
+++ b/src/transformers/models/superpoint/modeling_superpoint.py
@@ -258,6 +258,9 @@ def _extract_keypoints(self, scores: torch.Tensor) -> Tuple[torch.Tensor, torch.
         # Convert (y, x) to (x, y)
         keypoints = torch.flip(keypoints, [1]).float()
 
+        # Convert to relative coordinates
+        keypoints = keypoints / torch.tensor([width, height], device=keypoints.device)
+
         return keypoints, scores
 
 

From 285c465bd484aa01b38bee1caa9086ba0f96bdb9 Mon Sep 17 00:00:00 2001
From: steven <steven.bucaille@gmail.com>
Date: Fri, 30 Aug 2024 11:46:48 +0200
Subject: [PATCH 02/32] test: added tests for post_process_keypoint_detection
 of SuperPointImageProcessor

---
 docs/source/en/model_doc/superpoint.md        |  1 +
 .../superpoint/image_processing_superpoint.py | 37 ++++++++++++++-----
 .../test_image_processing_superpoint.py       | 29 ++++++++++++++-
 3 files changed, 56 insertions(+), 11 deletions(-)

diff --git a/docs/source/en/model_doc/superpoint.md b/docs/source/en/model_doc/superpoint.md
index b9aab2f1b929..13eec5ffe1b2 100644
--- a/docs/source/en/model_doc/superpoint.md
+++ b/docs/source/en/model_doc/superpoint.md
@@ -123,6 +123,7 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h
 [[autodoc]] SuperPointImageProcessor
 
 - preprocess
+- post_process_keypoint_detection
 
 ## SuperPointForKeypointDetection
 
diff --git a/src/transformers/models/superpoint/image_processing_superpoint.py b/src/transformers/models/superpoint/image_processing_superpoint.py
index b0ea901c4252..4ab8a895275d 100644
--- a/src/transformers/models/superpoint/image_processing_superpoint.py
+++ b/src/transformers/models/superpoint/image_processing_superpoint.py
@@ -30,6 +30,7 @@
     valid_images,
 )
 from ...utils import TensorType, logging, requires_backends
+from .modeling_superpoint import SuperPointKeypointDescriptionOutput
 
 
 if is_torch_available():
@@ -274,27 +275,45 @@ def preprocess(
 
         return BatchFeature(data=data, tensor_type=return_tensors)
 
-    def post_process_keypoint_detection(self, outputs, target_sizes, unwrap_batch_dim=True):
+    def post_process_keypoint_detection(
+        self, outputs: SuperPointKeypointDescriptionOutput, target_sizes: torch.Tensor
+    ):
+        """
+        Converts the raw output of [`SuperPointForKeypointDetection`] into lists of keypoints, scores and descriptors
+        with coordinates absolute to the original image sizes.
+
+        Args:
+            outputs ([`SuperPointKeypointDescriptionOutput`]):
+                Raw outputs of the model.
+            target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
+                Tensor containing the size (h, w) of each image of the batch. This must be the original
+                image size (before any processing).
+        Returns:
+            `List[Dict]`: A list of dictionaries, each dictionary containing the keypoints, scores and descriptors for
+            an image in the batch as predicted by the model.
+        """
         if len(outputs.mask) != len(target_sizes):
             raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")
         if target_sizes.shape[1] != 2:
             raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")
 
-        for keypoints, target_size in zip(outputs.keypoints, target_sizes):
+        masked_keypoints = outputs.keypoints.clone()
+
+        for keypoints, target_size in zip(masked_keypoints, target_sizes):
             keypoints[:, 0] = keypoints[:, 0] * target_size[1]
             keypoints[:, 1] = keypoints[:, 1] * target_size[0]
 
         # Convert masked_keypoints to int
-        masked_keypoints = outputs.keypoints.to(torch.int32)
-
-        outputs.keypoints = masked_keypoints
+        masked_keypoints = masked_keypoints.to(torch.int32)
 
         results = []
-        for i, image_mask in enumerate(outputs.mask):
+        for image_mask, keypoints, scores, descriptors in zip(
+            outputs.mask, masked_keypoints, outputs.scores, outputs.descriptors
+        ):
             indices = torch.nonzero(image_mask).squeeze(1)
-            keypoints = outputs.keypoints[i][indices]
-            scores = outputs.scores[i][indices]
-            descriptors = outputs.descriptors[i][indices]
+            keypoints = keypoints[indices]
+            scores = scores[indices]
+            descriptors = descriptors[indices]
             results.append({"keypoints": keypoints, "scores": scores, "descriptors": descriptors})
 
         return results
diff --git a/tests/models/superpoint/test_image_processing_superpoint.py b/tests/models/superpoint/test_image_processing_superpoint.py
index 90bbf82d1ed8..332ac0847157 100644
--- a/tests/models/superpoint/test_image_processing_superpoint.py
+++ b/tests/models/superpoint/test_image_processing_superpoint.py
@@ -15,7 +15,7 @@
 
 import numpy as np
 
-from transformers.testing_utils import require_torch, require_vision
+from transformers.testing_utils import require_torch, require_vision, slow
 from transformers.utils import is_vision_available
 
 from ...test_image_processing_common import (
@@ -25,7 +25,10 @@
 
 
 if is_vision_available():
-    from transformers import SuperPointImageProcessor
+    from transformers import SuperPointForKeypointDetection, SuperPointImageProcessor, is_torch_available
+
+if is_torch_available():
+    import torch
 
 
 class SuperPointImageProcessingTester(unittest.TestCase):
@@ -110,3 +113,25 @@ def test_input_image_properly_converted_to_grayscale(self):
         pre_processed_images = image_processor.preprocess(image_inputs)
         for image in pre_processed_images["pixel_values"]:
             self.assertTrue(np.all(image[0, ...] == image[1, ...]) and np.all(image[1, ...] == image[2, ...]))
+
+    @slow
+    def test_post_processing_keypoint_detection(self):
+        image_processor = self.image_processing_class.from_dict(self.image_processor_dict)
+        model = SuperPointForKeypointDetection.from_pretrained("magic-leap-community/superpoint")
+        image_inputs = self.image_processor_tester.prepare_image_inputs()
+        pre_processed_images = image_processor.preprocess(image_inputs, return_tensors="pt")
+        outputs = model(**pre_processed_images)
+        image_sizes = torch.tensor([image.size for image in image_inputs])
+        post_processed_outputs = image_processor.post_process_keypoint_detection(outputs, image_sizes)
+
+        self.assertTrue(len(post_processed_outputs) == self.image_processor_tester.batch_size)
+        for post_processed_output, image_size in zip(post_processed_outputs, image_sizes):
+            self.assertTrue("keypoints" in post_processed_output)
+            self.assertTrue("descriptors" in post_processed_output)
+            self.assertTrue("scores" in post_processed_output)
+            keypoints = post_processed_output["keypoints"]
+            all_below_image_size = torch.all(keypoints[:, 0] <= image_size[1]) and torch.all(
+                keypoints[:, 1] <= image_size[0]
+            )
+            all_above_zero = torch.all(keypoints[:, 0] >= 0) and torch.all(keypoints[:, 1] >= 0)
+            self.assertTrue(all_below_image_size and all_above_zero)

From 2efe61b5fa1bcd1ba3617ab8735b1d043f93c704 Mon Sep 17 00:00:00 2001
From: steven <steven.bucaille@gmail.com>
Date: Fri, 30 Aug 2024 16:40:20 +0200
Subject: [PATCH 03/32] docs: changed docs to include
 post_process_keypoint_detection method and switched from opencv to matplotlib

---
 docs/source/en/model_doc/superpoint.md        | 30 +++++++++++--------
 .../test_image_processing_superpoint.py       | 10 +++----
 2 files changed, 22 insertions(+), 18 deletions(-)

diff --git a/docs/source/en/model_doc/superpoint.md b/docs/source/en/model_doc/superpoint.md
index 13eec5ffe1b2..4b053e2d88ed 100644
--- a/docs/source/en/model_doc/superpoint.md
+++ b/docs/source/en/model_doc/superpoint.md
@@ -86,23 +86,27 @@ model = SuperPointForKeypointDetection.from_pretrained("magic-leap-community/sup
 
 inputs = processor(images, return_tensors="pt")
 outputs = model(**inputs)
+image_sizes = torch.tensor([image.size for image in images]).flip(1)
+outputs = processor.post_process_keypoint_detection(outputs, image_sizes)
 
-for i in range(len(images)):
-    image_mask = outputs.mask[i]
-    image_indices = torch.nonzero(image_mask).squeeze()
-    image_keypoints = outputs.keypoints[i][image_indices]
-    image_scores = outputs.scores[i][image_indices]
-    image_descriptors = outputs.descriptors[i][image_indices]
+for output in outputs:
+    keypoints = output["keypoints"]
+    scores = output["scores"]
+    descriptors = output["descriptors"]
 ```
 
-You can then print the keypoints on the image to visualize the result :
+You can then print the keypoints on the image of your choice to visualize the result :
 ```python
-import cv2
-for keypoint, score in zip(image_keypoints, image_scores):
-    keypoint_x, keypoint_y = int(keypoint[0].item()), int(keypoint[1].item())
-    color = tuple([score.item() * 255] * 3)
-    image = cv2.circle(image, (keypoint_x, keypoint_y), 2, color)
-cv2.imwrite("output_image.png", image)
+import matplotlib.pyplot as plt
+plt.axis("off")
+plt.imshow(image)
+plt.scatter(
+    keypoints[:, 0],
+    keypoints[:, 1],
+    c=scores * 100,
+    s=scores * 20
+)
+plt.savefig(f"output_image.png")
 ```
 
 This model was contributed by [stevenbucaille](https://huggingface.co/stevenbucaille).
diff --git a/tests/models/superpoint/test_image_processing_superpoint.py b/tests/models/superpoint/test_image_processing_superpoint.py
index 332ac0847157..18aa0496f893 100644
--- a/tests/models/superpoint/test_image_processing_superpoint.py
+++ b/tests/models/superpoint/test_image_processing_superpoint.py
@@ -16,7 +16,7 @@
 import numpy as np
 
 from transformers.testing_utils import require_torch, require_vision, slow
-from transformers.utils import is_vision_available
+from transformers.utils import is_torch_available, is_vision_available
 
 from ...test_image_processing_common import (
     ImageProcessingTestMixin,
@@ -24,12 +24,12 @@
 )
 
 
-if is_vision_available():
-    from transformers import SuperPointForKeypointDetection, SuperPointImageProcessor, is_torch_available
-
 if is_torch_available():
     import torch
 
+if is_vision_available():
+    from transformers import SuperPointForKeypointDetection, SuperPointImageProcessor
+
 
 class SuperPointImageProcessingTester(unittest.TestCase):
     def __init__(
@@ -121,7 +121,7 @@ def test_post_processing_keypoint_detection(self):
         image_inputs = self.image_processor_tester.prepare_image_inputs()
         pre_processed_images = image_processor.preprocess(image_inputs, return_tensors="pt")
         outputs = model(**pre_processed_images)
-        image_sizes = torch.tensor([image.size for image in image_inputs])
+        image_sizes = torch.tensor([image.size for image in image_inputs]).flip(1)
         post_processed_outputs = image_processor.post_process_keypoint_detection(outputs, image_sizes)
 
         self.assertTrue(len(post_processed_outputs) == self.image_processor_tester.batch_size)

From a77b87055462b19f0c36d051ab42fb1e2d7f1876 Mon Sep 17 00:00:00 2001
From: steven <steven.bucaille@gmail.com>
Date: Fri, 30 Aug 2024 22:42:28 +0200
Subject: [PATCH 04/32] test: changed test to not depend on SuperPointModel
 forward

---
 .../superpoint/image_processing_superpoint.py |  2 +-
 .../test_image_processing_superpoint.py       | 23 ++++++++++++++++---
 2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/src/transformers/models/superpoint/image_processing_superpoint.py b/src/transformers/models/superpoint/image_processing_superpoint.py
index 4ab8a895275d..aa0ffa9ae673 100644
--- a/src/transformers/models/superpoint/image_processing_superpoint.py
+++ b/src/transformers/models/superpoint/image_processing_superpoint.py
@@ -293,7 +293,7 @@ def post_process_keypoint_detection(
             an image in the batch as predicted by the model.
         """
         if len(outputs.mask) != len(target_sizes):
-            raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")
+            raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the mask")
         if target_sizes.shape[1] != 2:
             raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")
 
diff --git a/tests/models/superpoint/test_image_processing_superpoint.py b/tests/models/superpoint/test_image_processing_superpoint.py
index 18aa0496f893..bf9cfcc24903 100644
--- a/tests/models/superpoint/test_image_processing_superpoint.py
+++ b/tests/models/superpoint/test_image_processing_superpoint.py
@@ -15,6 +15,7 @@
 
 import numpy as np
 
+from transformers.models.superpoint.modeling_superpoint import SuperPointKeypointDescriptionOutput
 from transformers.testing_utils import require_torch, require_vision, slow
 from transformers.utils import is_torch_available, is_vision_available
 
@@ -28,7 +29,7 @@
     import torch
 
 if is_vision_available():
-    from transformers import SuperPointForKeypointDetection, SuperPointImageProcessor
+    from transformers import SuperPointImageProcessor
 
 
 class SuperPointImageProcessingTester(unittest.TestCase):
@@ -73,6 +74,23 @@ def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=F
             torchify=torchify,
         )
 
+    def prepare_keypoint_detection_output(self, pixel_values):
+        max_number_keypoints = 50
+        batch_size = len(pixel_values)
+        mask = torch.zeros((batch_size, max_number_keypoints))
+        keypoints = torch.zeros((batch_size, max_number_keypoints, 2))
+        scores = torch.zeros((batch_size, max_number_keypoints))
+        descriptors = torch.zeros((batch_size, max_number_keypoints, 16))
+        for i in range(batch_size):
+            random_number_keypoints = np.random.randint(0, max_number_keypoints)
+            mask[i, :random_number_keypoints] = 1
+            keypoints[i, :random_number_keypoints] = torch.rand((random_number_keypoints, 2))
+            scores[i, :random_number_keypoints] = torch.rand((random_number_keypoints,))
+            descriptors[i, :random_number_keypoints] = torch.rand((random_number_keypoints, 16))
+        return SuperPointKeypointDescriptionOutput(
+            loss=None, keypoints=keypoints, scores=scores, descriptors=descriptors, mask=mask, hidden_states=None
+        )
+
 
 @require_torch
 @require_vision
@@ -117,10 +135,9 @@ def test_input_image_properly_converted_to_grayscale(self):
     @slow
     def test_post_processing_keypoint_detection(self):
         image_processor = self.image_processing_class.from_dict(self.image_processor_dict)
-        model = SuperPointForKeypointDetection.from_pretrained("magic-leap-community/superpoint")
         image_inputs = self.image_processor_tester.prepare_image_inputs()
         pre_processed_images = image_processor.preprocess(image_inputs, return_tensors="pt")
-        outputs = model(**pre_processed_images)
+        outputs = self.image_processor_tester.prepare_keypoint_detection_output(**pre_processed_images)
         image_sizes = torch.tensor([image.size for image in image_inputs]).flip(1)
         post_processed_outputs = image_processor.post_process_keypoint_detection(outputs, image_sizes)
 

From 2ab79cdccc65e09251aed61ef508fcbb4b672c86 Mon Sep 17 00:00:00 2001
From: steven <steven.bucaille@gmail.com>
Date: Fri, 30 Aug 2024 22:59:15 +0200
Subject: [PATCH 05/32] test: added missing require_torch decorator

---
 tests/models/superpoint/test_image_processing_superpoint.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/models/superpoint/test_image_processing_superpoint.py b/tests/models/superpoint/test_image_processing_superpoint.py
index bf9cfcc24903..8acd2f7e3746 100644
--- a/tests/models/superpoint/test_image_processing_superpoint.py
+++ b/tests/models/superpoint/test_image_processing_superpoint.py
@@ -133,6 +133,7 @@ def test_input_image_properly_converted_to_grayscale(self):
             self.assertTrue(np.all(image[0, ...] == image[1, ...]) and np.all(image[1, ...] == image[2, ...]))
 
     @slow
+    @require_torch
     def test_post_processing_keypoint_detection(self):
         image_processor = self.image_processing_class.from_dict(self.image_processor_dict)
         image_inputs = self.image_processor_tester.prepare_image_inputs()

From 419ae5dcf2ed40f58d2a9c419e17cded3bcf0e01 Mon Sep 17 00:00:00 2001
From: steven <steven.bucaille@gmail.com>
Date: Fri, 30 Aug 2024 22:59:51 +0200
Subject: [PATCH 06/32] docs: changed pyplot parameters for the keypoints to be
 more visible in the example

---
 docs/source/en/model_doc/superpoint.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/source/en/model_doc/superpoint.md b/docs/source/en/model_doc/superpoint.md
index 4b053e2d88ed..18f3d76819fd 100644
--- a/docs/source/en/model_doc/superpoint.md
+++ b/docs/source/en/model_doc/superpoint.md
@@ -104,7 +104,8 @@ plt.scatter(
     keypoints[:, 0],
     keypoints[:, 1],
     c=scores * 100,
-    s=scores * 20
+    s=scores * 50,
+    alpha=0.8
 )
 plt.savefig(f"output_image.png")
 ```

From 39b32a2f69500bc7af01715fc7beae2260549afe Mon Sep 17 00:00:00 2001
From: steven <steven.bucaille@gmail.com>
Date: Fri, 30 Aug 2024 23:03:59 +0200
Subject: [PATCH 07/32] tests: changed import torch location to make test_flax
 and test_tf

---
 tests/models/superpoint/test_image_processing_superpoint.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tests/models/superpoint/test_image_processing_superpoint.py b/tests/models/superpoint/test_image_processing_superpoint.py
index 8acd2f7e3746..36df033c3db0 100644
--- a/tests/models/superpoint/test_image_processing_superpoint.py
+++ b/tests/models/superpoint/test_image_processing_superpoint.py
@@ -25,9 +25,6 @@
 )
 
 
-if is_torch_available():
-    import torch
-
 if is_vision_available():
     from transformers import SuperPointImageProcessor
 
@@ -75,6 +72,7 @@ def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=F
         )
 
     def prepare_keypoint_detection_output(self, pixel_values):
+        import torch
         max_number_keypoints = 50
         batch_size = len(pixel_values)
         mask = torch.zeros((batch_size, max_number_keypoints))

From 144e09a2a8917649fffe5d309a46fc5e033a417e Mon Sep 17 00:00:00 2001
From: steven <steven.bucaille@gmail.com>
Date: Fri, 30 Aug 2024 23:19:15 +0200
Subject: [PATCH 08/32] Revert "tests: changed import torch location to make
 test_flax and test_tf"

This reverts commit 39b32a2f69500bc7af01715fc7beae2260549afe.
---
 tests/models/superpoint/test_image_processing_superpoint.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/models/superpoint/test_image_processing_superpoint.py b/tests/models/superpoint/test_image_processing_superpoint.py
index 36df033c3db0..8acd2f7e3746 100644
--- a/tests/models/superpoint/test_image_processing_superpoint.py
+++ b/tests/models/superpoint/test_image_processing_superpoint.py
@@ -25,6 +25,9 @@
 )
 
 
+if is_torch_available():
+    import torch
+
 if is_vision_available():
     from transformers import SuperPointImageProcessor
 
@@ -72,7 +75,6 @@ def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=F
         )
 
     def prepare_keypoint_detection_output(self, pixel_values):
-        import torch
         max_number_keypoints = 50
         batch_size = len(pixel_values)
         mask = torch.zeros((batch_size, max_number_keypoints))

From 21dbdfc0341f55bd1a32007aaca2d12fa47ad551 Mon Sep 17 00:00:00 2001
From: steven <steven.bucaille@gmail.com>
Date: Fri, 30 Aug 2024 23:31:21 +0200
Subject: [PATCH 09/32] tests: fixed import

---
 tests/models/superpoint/test_image_processing_superpoint.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/models/superpoint/test_image_processing_superpoint.py b/tests/models/superpoint/test_image_processing_superpoint.py
index 8acd2f7e3746..3f0a625d8bf8 100644
--- a/tests/models/superpoint/test_image_processing_superpoint.py
+++ b/tests/models/superpoint/test_image_processing_superpoint.py
@@ -15,7 +15,6 @@
 
 import numpy as np
 
-from transformers.models.superpoint.modeling_superpoint import SuperPointKeypointDescriptionOutput
 from transformers.testing_utils import require_torch, require_vision, slow
 from transformers.utils import is_torch_available, is_vision_available
 
@@ -30,6 +29,7 @@
 
 if is_vision_available():
     from transformers import SuperPointImageProcessor
+    from transformers.models.superpoint.modeling_superpoint import SuperPointKeypointDescriptionOutput
 
 
 class SuperPointImageProcessingTester(unittest.TestCase):

From 389b154da0910ed093ec45abbac89d0f77177c42 Mon Sep 17 00:00:00 2001
From: StevenBucaille <steven.bucaille@gmail.com>
Date: Sun, 1 Sep 2024 14:59:16 +0200
Subject: [PATCH 10/32] chore: applied suggestions from code review

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
---
 docs/source/en/model_doc/superpoint.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/source/en/model_doc/superpoint.md b/docs/source/en/model_doc/superpoint.md
index 18f3d76819fd..c49734f9216c 100644
--- a/docs/source/en/model_doc/superpoint.md
+++ b/docs/source/en/model_doc/superpoint.md
@@ -95,9 +95,10 @@ for output in outputs:
     descriptors = output["descriptors"]
 ```
 
-You can then print the keypoints on the image of your choice to visualize the result :
+You can then print the keypoints on the image of your choice to visualize the result:
 ```python
 import matplotlib.pyplot as plt
+
 plt.axis("off")
 plt.imshow(image)
 plt.scatter(

From b7d672e979c0331281131d1302be038ddec5a053 Mon Sep 17 00:00:00 2001
From: steven <steven.bucaille@gmail.com>
Date: Sun, 1 Sep 2024 14:59:57 +0200
Subject: [PATCH 11/32] tests: fixed import

---
 tests/models/superpoint/test_image_processing_superpoint.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/models/superpoint/test_image_processing_superpoint.py b/tests/models/superpoint/test_image_processing_superpoint.py
index 3f0a625d8bf8..64112aadda09 100644
--- a/tests/models/superpoint/test_image_processing_superpoint.py
+++ b/tests/models/superpoint/test_image_processing_superpoint.py
@@ -27,9 +27,11 @@
 if is_torch_available():
     import torch
 
+    from transformers.models.superpoint.modeling_superpoint import SuperPointKeypointDescriptionOutput
+
 if is_vision_available():
     from transformers import SuperPointImageProcessor
-    from transformers.models.superpoint.modeling_superpoint import SuperPointKeypointDescriptionOutput
+
 
 
 class SuperPointImageProcessingTester(unittest.TestCase):

From f5d731181cbb48a7a4ad8fbf078a6a2f1b8010a0 Mon Sep 17 00:00:00 2001
From: steven <steven.bucaille@gmail.com>
Date: Sun, 1 Sep 2024 15:05:39 +0200
Subject: [PATCH 12/32] tests: fixed import (bis)

---
 .../models/superpoint/image_processing_superpoint.py          | 4 +++-
 tests/models/superpoint/test_image_processing_superpoint.py   | 4 +---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/transformers/models/superpoint/image_processing_superpoint.py b/src/transformers/models/superpoint/image_processing_superpoint.py
index aa0ffa9ae673..96a18bd130ea 100644
--- a/src/transformers/models/superpoint/image_processing_superpoint.py
+++ b/src/transformers/models/superpoint/image_processing_superpoint.py
@@ -30,15 +30,17 @@
     valid_images,
 )
 from ...utils import TensorType, logging, requires_backends
-from .modeling_superpoint import SuperPointKeypointDescriptionOutput
 
 
 if is_torch_available():
     import torch
 
+
 if is_vision_available():
     import PIL
 
+    from .modeling_superpoint import SuperPointKeypointDescriptionOutput
+
 logger = logging.get_logger(__name__)
 
 
diff --git a/tests/models/superpoint/test_image_processing_superpoint.py b/tests/models/superpoint/test_image_processing_superpoint.py
index 64112aadda09..3f0a625d8bf8 100644
--- a/tests/models/superpoint/test_image_processing_superpoint.py
+++ b/tests/models/superpoint/test_image_processing_superpoint.py
@@ -27,11 +27,9 @@
 if is_torch_available():
     import torch
 
-    from transformers.models.superpoint.modeling_superpoint import SuperPointKeypointDescriptionOutput
-
 if is_vision_available():
     from transformers import SuperPointImageProcessor
-
+    from transformers.models.superpoint.modeling_superpoint import SuperPointKeypointDescriptionOutput
 
 
 class SuperPointImageProcessingTester(unittest.TestCase):

From d89d38541716c02d5ccd937f5d17c88b7a9c7e24 Mon Sep 17 00:00:00 2001
From: steven <steven.bucaille@gmail.com>
Date: Sun, 1 Sep 2024 15:09:02 +0200
Subject: [PATCH 13/32] tests: fixed import (ter)

---
 .../models/superpoint/image_processing_superpoint.py           | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/transformers/models/superpoint/image_processing_superpoint.py b/src/transformers/models/superpoint/image_processing_superpoint.py
index 96a18bd130ea..948cef817d3f 100644
--- a/src/transformers/models/superpoint/image_processing_superpoint.py
+++ b/src/transformers/models/superpoint/image_processing_superpoint.py
@@ -35,11 +35,12 @@
 if is_torch_available():
     import torch
 
+    from .modeling_superpoint import SuperPointKeypointDescriptionOutput
+
 
 if is_vision_available():
     import PIL
 
-    from .modeling_superpoint import SuperPointKeypointDescriptionOutput
 
 logger = logging.get_logger(__name__)
 

From f9e1141256ab5a726dd3fc7c1fcfe9b5a71f7ac3 Mon Sep 17 00:00:00 2001
From: steven <steven.bucaille@gmail.com>
Date: Sun, 1 Sep 2024 15:33:49 +0200
Subject: [PATCH 14/32] feat: added choice of type for target_size and changed
 tests accordingly

---
 .../superpoint/image_processing_superpoint.py | 26 +++++++++-----
 .../test_image_processing_superpoint.py       | 36 +++++++++++--------
 2 files changed, 39 insertions(+), 23 deletions(-)

diff --git a/src/transformers/models/superpoint/image_processing_superpoint.py b/src/transformers/models/superpoint/image_processing_superpoint.py
index 948cef817d3f..5be079992db7 100644
--- a/src/transformers/models/superpoint/image_processing_superpoint.py
+++ b/src/transformers/models/superpoint/image_processing_superpoint.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 """Image processor class for SuperPoint."""
 
-from typing import Dict, Optional, Union
+from typing import Dict, List, Optional, Tuple, Union
 
 import numpy as np
 
@@ -279,7 +279,7 @@ def preprocess(
         return BatchFeature(data=data, tensor_type=return_tensors)
 
     def post_process_keypoint_detection(
-        self, outputs: SuperPointKeypointDescriptionOutput, target_sizes: torch.Tensor
+        self, outputs: SuperPointKeypointDescriptionOutput, target_sizes: Union[TensorType, List[Tuple]]
     ):
         """
         Converts the raw output of [`SuperPointForKeypointDetection`] into lists of keypoints, scores and descriptors
@@ -288,8 +288,9 @@ def post_process_keypoint_detection(
         Args:
             outputs ([`SuperPointKeypointDescriptionOutput`]):
                 Raw outputs of the model.
-            target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
-                Tensor containing the size (h, w) of each image of the batch. This must be the original
+            target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
+                Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
+                `(height, width)` of each image in the batch. This must be the original
                 image size (before any processing).
         Returns:
             `List[Dict]`: A list of dictionaries, each dictionary containing the keypoints, scores and descriptors for
@@ -297,14 +298,21 @@ def post_process_keypoint_detection(
         """
         if len(outputs.mask) != len(target_sizes):
             raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the mask")
-        if target_sizes.shape[1] != 2:
-            raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")
+
+        if isinstance(target_sizes, List):
+            image_sizes = torch.tensor(target_sizes)
+        else:
+            if target_sizes.shape[1] != 2:
+                raise ValueError(
+                    "Each element of target_sizes must contain the size (h, w) of each image of the batch"
+                )
+            image_sizes = target_sizes
 
         masked_keypoints = outputs.keypoints.clone()
 
-        for keypoints, target_size in zip(masked_keypoints, target_sizes):
-            keypoints[:, 0] = keypoints[:, 0] * target_size[1]
-            keypoints[:, 1] = keypoints[:, 1] * target_size[0]
+        for keypoints, image_size in zip(masked_keypoints, image_sizes):
+            keypoints[:, 0] = keypoints[:, 0] * image_size[1]
+            keypoints[:, 1] = keypoints[:, 1] * image_size[0]
 
         # Convert masked_keypoints to int
         masked_keypoints = masked_keypoints.to(torch.int32)
diff --git a/tests/models/superpoint/test_image_processing_superpoint.py b/tests/models/superpoint/test_image_processing_superpoint.py
index 3f0a625d8bf8..c06682a20d8d 100644
--- a/tests/models/superpoint/test_image_processing_superpoint.py
+++ b/tests/models/superpoint/test_image_processing_superpoint.py
@@ -139,17 +139,25 @@ def test_post_processing_keypoint_detection(self):
         image_inputs = self.image_processor_tester.prepare_image_inputs()
         pre_processed_images = image_processor.preprocess(image_inputs, return_tensors="pt")
         outputs = self.image_processor_tester.prepare_keypoint_detection_output(**pre_processed_images)
-        image_sizes = torch.tensor([image.size for image in image_inputs]).flip(1)
-        post_processed_outputs = image_processor.post_process_keypoint_detection(outputs, image_sizes)
-
-        self.assertTrue(len(post_processed_outputs) == self.image_processor_tester.batch_size)
-        for post_processed_output, image_size in zip(post_processed_outputs, image_sizes):
-            self.assertTrue("keypoints" in post_processed_output)
-            self.assertTrue("descriptors" in post_processed_output)
-            self.assertTrue("scores" in post_processed_output)
-            keypoints = post_processed_output["keypoints"]
-            all_below_image_size = torch.all(keypoints[:, 0] <= image_size[1]) and torch.all(
-                keypoints[:, 1] <= image_size[0]
-            )
-            all_above_zero = torch.all(keypoints[:, 0] >= 0) and torch.all(keypoints[:, 1] >= 0)
-            self.assertTrue(all_below_image_size and all_above_zero)
+
+        def check_post_processed_output(post_processed_output, image_size):
+            for post_processed_output, image_size in zip(post_processed_output, image_size):
+                self.assertTrue("keypoints" in post_processed_output)
+                self.assertTrue("descriptors" in post_processed_output)
+                self.assertTrue("scores" in post_processed_output)
+                keypoints = post_processed_output["keypoints"]
+                all_below_image_size = torch.all(keypoints[:, 0] <= image_size[1]) and torch.all(
+                    keypoints[:, 1] <= image_size[0]
+                )
+                all_above_zero = torch.all(keypoints[:, 0] >= 0) and torch.all(keypoints[:, 1] >= 0)
+                self.assertTrue(all_below_image_size and all_above_zero)
+
+        tuple_image_sizes = [(image.size[0], image.size[1]) for image in image_inputs]
+        tuple_post_processed_outputs = image_processor.post_process_keypoint_detection(outputs, tuple_image_sizes)
+
+        check_post_processed_output(tuple_post_processed_outputs, tuple_image_sizes)
+
+        tensor_image_sizes = torch.tensor([image.size for image in image_inputs]).flip(1)
+        tensor_post_processed_outputs = image_processor.post_process_keypoint_detection(outputs, tensor_image_sizes)
+
+        check_post_processed_output(tensor_post_processed_outputs, tensor_image_sizes)

From 32a2e96acfccbe3d23ffca6195f5e6d0c6505ef6 Mon Sep 17 00:00:00 2001
From: steven <steven.bucaille@gmail.com>
Date: Sun, 1 Sep 2024 15:36:27 +0200
Subject: [PATCH 15/32] docs: updated code snippet to reflect the addition of
 target size type choice in post process method

---
 docs/source/en/model_doc/superpoint.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/en/model_doc/superpoint.md b/docs/source/en/model_doc/superpoint.md
index c49734f9216c..0663030c529d 100644
--- a/docs/source/en/model_doc/superpoint.md
+++ b/docs/source/en/model_doc/superpoint.md
@@ -86,7 +86,7 @@ model = SuperPointForKeypointDetection.from_pretrained("magic-leap-community/sup
 
 inputs = processor(images, return_tensors="pt")
 outputs = model(**inputs)
-image_sizes = torch.tensor([image.size for image in images]).flip(1)
+image_sizes = [(image[1], image[0]) for image in images]
 outputs = processor.post_process_keypoint_detection(outputs, image_sizes)
 
 for output in outputs:

From 560194e8619ee3cfdb64ef0e19e51f5e7f79c9cc Mon Sep 17 00:00:00 2001
From: Steven Bucaille <steven.bucaille@buawei.com>
Date: Mon, 2 Sep 2024 08:27:41 +0000
Subject: [PATCH 16/32] tests: moved SuperPointKeypointDescriptionOutput import
 under the is_torch_available guard

---
 tests/models/superpoint/test_image_processing_superpoint.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/models/superpoint/test_image_processing_superpoint.py b/tests/models/superpoint/test_image_processing_superpoint.py
index c06682a20d8d..80da823cde16 100644
--- a/tests/models/superpoint/test_image_processing_superpoint.py
+++ b/tests/models/superpoint/test_image_processing_superpoint.py
@@ -27,9 +27,10 @@
 if is_torch_available():
     import torch
 
+    from transformers.models.superpoint.modeling_superpoint import SuperPointKeypointDescriptionOutput
+
 if is_vision_available():
     from transformers import SuperPointImageProcessor
-    from transformers.models.superpoint.modeling_superpoint import SuperPointKeypointDescriptionOutput
 
 
 class SuperPointImageProcessingTester(unittest.TestCase):

From 2d28aba57f513f4a7e706c3590d42904ae315bca Mon Sep 17 00:00:00 2001
From: Steven Bucaille <steven.bucaille@buawei.com>
Date: Mon, 2 Sep 2024 08:33:07 +0000
Subject: [PATCH 17/32] fix: quoted SuperPointKeypointDescriptionOutput type
 hint as a forward reference

---
 .../models/superpoint/image_processing_superpoint.py            | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/superpoint/image_processing_superpoint.py b/src/transformers/models/superpoint/image_processing_superpoint.py
index 5be079992db7..a6666c18a853 100644
--- a/src/transformers/models/superpoint/image_processing_superpoint.py
+++ b/src/transformers/models/superpoint/image_processing_superpoint.py
@@ -279,7 +279,7 @@ def preprocess(
         return BatchFeature(data=data, tensor_type=return_tensors)
 
     def post_process_keypoint_detection(
-        self, outputs: SuperPointKeypointDescriptionOutput, target_sizes: Union[TensorType, List[Tuple]]
+        self, outputs: 'SuperPointKeypointDescriptionOutput', target_sizes: Union[TensorType, List[Tuple]]
     ):
         """
         Converts the raw output of [`SuperPointForKeypointDetection`] into lists of keypoints, scores and descriptors

From bd23baa6c7a966f1b68ecf0dfa7e74a26c67613e Mon Sep 17 00:00:00 2001
From: Steven Bucaille <steven.bucaille@buawei.com>
Date: Mon, 2 Sep 2024 08:36:26 +0000
Subject: [PATCH 18/32] style: formatting file

---
 .../models/superpoint/image_processing_superpoint.py            | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/superpoint/image_processing_superpoint.py b/src/transformers/models/superpoint/image_processing_superpoint.py
index a6666c18a853..ada2ca1c5b8c 100644
--- a/src/transformers/models/superpoint/image_processing_superpoint.py
+++ b/src/transformers/models/superpoint/image_processing_superpoint.py
@@ -279,7 +279,7 @@ def preprocess(
         return BatchFeature(data=data, tensor_type=return_tensors)
 
     def post_process_keypoint_detection(
-        self, outputs: 'SuperPointKeypointDescriptionOutput', target_sizes: Union[TensorType, List[Tuple]]
+        self, outputs: "SuperPointKeypointDescriptionOutput", target_sizes: Union[TensorType, List[Tuple]]
     ):
         """
         Converts the raw output of [`SuperPointForKeypointDetection`] into lists of keypoints, scores and descriptors

From 5bb0baf0d5313800661f08e6af16fbbdd5a5bf30 Mon Sep 17 00:00:00 2001
From: steven <steven.bucaille@gmail.com>
Date: Mon, 2 Sep 2024 21:03:23 +0200
Subject: [PATCH 19/32] docs: fixed typo from image[0] to image.size[0]

---
 docs/source/en/model_doc/superpoint.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/en/model_doc/superpoint.md b/docs/source/en/model_doc/superpoint.md
index 0663030c529d..b83685f12997 100644
--- a/docs/source/en/model_doc/superpoint.md
+++ b/docs/source/en/model_doc/superpoint.md
@@ -86,7 +86,7 @@ model = SuperPointForKeypointDetection.from_pretrained("magic-leap-community/sup
 
 inputs = processor(images, return_tensors="pt")
 outputs = model(**inputs)
-image_sizes = [(image[1], image[0]) for image in images]
+image_sizes = [(image.size[1], image.size[0]) for image in images]
 outputs = processor.post_process_keypoint_detection(outputs, image_sizes)
 
 for output in outputs:

From ed28314b975eb0b1ee105f41aa9b2f5dd50362f4 Mon Sep 17 00:00:00 2001
From: steven <steven.bucaille@gmail.com>
Date: Thu, 5 Sep 2024 21:46:03 +0200
Subject: [PATCH 20/32] docs: added output image and fixed some tests

---
 docs/source/en/model_doc/superpoint.md                      | 1 +
 tests/models/superpoint/test_image_processing_superpoint.py | 6 +++---
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/docs/source/en/model_doc/superpoint.md b/docs/source/en/model_doc/superpoint.md
index b83685f12997..94ac3d3ec56e 100644
--- a/docs/source/en/model_doc/superpoint.md
+++ b/docs/source/en/model_doc/superpoint.md
@@ -110,6 +110,7 @@ plt.scatter(
 )
 plt.savefig(f"output_image.png")
 ```
+![SuperPoint keypoints plotted on the input image](https://cdn-uploads.huggingface.co/production/uploads/632885ba1558dac67c440aa8/ZtFmphEhx8tcbEQqOolyE.png)
 
 This model was contributed by [stevenbucaille](https://huggingface.co/stevenbucaille).
 The original code can be found [here](https://github.com/magicleap/SuperPointPretrainedNetwork).
diff --git a/tests/models/superpoint/test_image_processing_superpoint.py b/tests/models/superpoint/test_image_processing_superpoint.py
index 80da823cde16..c2eae872004c 100644
--- a/tests/models/superpoint/test_image_processing_superpoint.py
+++ b/tests/models/superpoint/test_image_processing_superpoint.py
@@ -15,7 +15,7 @@
 
 import numpy as np
 
-from transformers.testing_utils import require_torch, require_vision, slow
+from transformers.testing_utils import require_torch, require_vision
 from transformers.utils import is_torch_available, is_vision_available
 
 from ...test_image_processing_common import (
@@ -133,7 +133,6 @@ def test_input_image_properly_converted_to_grayscale(self):
         for image in pre_processed_images["pixel_values"]:
             self.assertTrue(np.all(image[0, ...] == image[1, ...]) and np.all(image[1, ...] == image[2, ...]))
 
-    @slow
     @require_torch
     def test_post_processing_keypoint_detection(self):
         image_processor = self.image_processing_class.from_dict(self.image_processor_dict)
@@ -151,7 +150,8 @@ def check_post_processed_output(post_processed_output, image_size):
                     keypoints[:, 1] <= image_size[0]
                 )
                 all_above_zero = torch.all(keypoints[:, 0] >= 0) and torch.all(keypoints[:, 1] >= 0)
-                self.assertTrue(all_below_image_size and all_above_zero)
+                self.assertTrue(all_below_image_size)
+                self.assertTrue(all_above_zero)
 
         tuple_image_sizes = [(image.size[0], image.size[1]) for image in image_inputs]
         tuple_post_processed_outputs = image_processor.post_process_keypoint_detection(outputs, tuple_image_sizes)

From 192448d0a2affbca6b8cd21cfb1be327b6d0a129 Mon Sep 17 00:00:00 2001
From: StevenBucaille <steven.bucaille@gmail.com>
Date: Wed, 2 Oct 2024 21:53:14 +0200
Subject: [PATCH 21/32] Update docs/source/en/model_doc/superpoint.md

Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com>
---
 docs/source/en/model_doc/superpoint.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/en/model_doc/superpoint.md b/docs/source/en/model_doc/superpoint.md
index 94ac3d3ec56e..d8bb7e95f185 100644
--- a/docs/source/en/model_doc/superpoint.md
+++ b/docs/source/en/model_doc/superpoint.md
@@ -86,7 +86,7 @@ model = SuperPointForKeypointDetection.from_pretrained("magic-leap-community/sup
 
 inputs = processor(images, return_tensors="pt")
 outputs = model(**inputs)
-image_sizes = [(image.size[1], image.size[0]) for image in images]
+image_sizes = [(image.height, image.width) for image in images]
 outputs = processor.post_process_keypoint_detection(outputs, image_sizes)
 
 for output in outputs:

From e89af7f80420e9768722df0c22cd6d3cc9e922e1 Mon Sep 17 00:00:00 2001
From: steven <steven.bucaille@gmail.com>
Date: Wed, 2 Oct 2024 22:27:20 +0200
Subject: [PATCH 22/32] fix: included SuperPointKeypointDescriptionOutput in
 TYPE_CHECKING if statement and changed tests results to reflect changes to
 SuperPoint from absolute keypoints coordinates to relative

---
 .../models/superpoint/image_processing_superpoint.py      | 5 ++---
 tests/models/superpoint/test_modeling_superpoint.py       | 8 ++++----
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/src/transformers/models/superpoint/image_processing_superpoint.py b/src/transformers/models/superpoint/image_processing_superpoint.py
index ada2ca1c5b8c..c023a40a5164 100644
--- a/src/transformers/models/superpoint/image_processing_superpoint.py
+++ b/src/transformers/models/superpoint/image_processing_superpoint.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 """Image processor class for SuperPoint."""
 
-from typing import Dict, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
 
 import numpy as np
 
@@ -35,13 +35,12 @@
 if is_torch_available():
     import torch
 
+if TYPE_CHECKING:
     from .modeling_superpoint import SuperPointKeypointDescriptionOutput
 
-
 if is_vision_available():
     import PIL
 
-
 logger = logging.get_logger(__name__)
 
 
diff --git a/tests/models/superpoint/test_modeling_superpoint.py b/tests/models/superpoint/test_modeling_superpoint.py
index 25c384a79557..d00a1b928470 100644
--- a/tests/models/superpoint/test_modeling_superpoint.py
+++ b/tests/models/superpoint/test_modeling_superpoint.py
@@ -260,7 +260,7 @@ def test_inference(self):
         inputs = preprocessor(images=images, return_tensors="pt").to(torch_device)
         with torch.no_grad():
             outputs = model(**inputs)
-        expected_number_keypoints_image0 = 567
+        expected_number_keypoints_image0 = 568
         expected_number_keypoints_image1 = 830
         expected_max_number_keypoints = max(expected_number_keypoints_image0, expected_number_keypoints_image1)
         expected_keypoints_shape = torch.Size((len(images), expected_max_number_keypoints, 2))
@@ -275,11 +275,11 @@ def test_inference(self):
         self.assertEqual(outputs.keypoints.shape, expected_keypoints_shape)
         self.assertEqual(outputs.scores.shape, expected_scores_shape)
         self.assertEqual(outputs.descriptors.shape, expected_descriptors_shape)
-        expected_keypoints_image0_values = torch.tensor([[480.0, 9.0], [494.0, 9.0], [489.0, 16.0]]).to(torch_device)
+        expected_keypoints_image0_values = torch.tensor([[0.75, 0.0188],[0.7719, 0.0188], [0.7641, 0.0333]]).to(torch_device)
         expected_scores_image0_values = torch.tensor(
-            [0.0064, 0.0137, 0.0589, 0.0723, 0.5166, 0.0174, 0.1515, 0.2054, 0.0334]
+            [0.0064, 0.0139, 0.0591, 0.0727, 0.5170, 0.0175, 0.1526, 0.2057, 0.0335]
         ).to(torch_device)
-        expected_descriptors_image0_value = torch.tensor(-0.1096).to(torch_device)
+        expected_descriptors_image0_value = torch.tensor(0.0449).to(torch_device)
         predicted_keypoints_image0_values = outputs.keypoints[0, :3]
         predicted_scores_image0_values = outputs.scores[0, :9]
         predicted_descriptors_image0_value = outputs.descriptors[0, 0, 0]

From 4e77a4fe09ef72d1fc5210356b66d89296e585e0 Mon Sep 17 00:00:00 2001
From: steven <steven.bucaille@gmail.com>
Date: Wed, 2 Oct 2024 22:29:48 +0200
Subject: [PATCH 23/32] docs: changed SuperPoint's docs to print output instead
 of just accessing

---
 docs/source/en/model_doc/superpoint.md | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/docs/source/en/model_doc/superpoint.md b/docs/source/en/model_doc/superpoint.md
index d8bb7e95f185..3abd1ff03af0 100644
--- a/docs/source/en/model_doc/superpoint.md
+++ b/docs/source/en/model_doc/superpoint.md
@@ -90,9 +90,10 @@ image_sizes = [(image.height, image.width) for image in images]
 outputs = processor.post_process_keypoint_detection(outputs, image_sizes)
 
 for output in outputs:
-    keypoints = output["keypoints"]
-    scores = output["scores"]
-    descriptors = output["descriptors"]
+    for keypoints, scores, descriptors in zip(output["keypoints"], output["scores"], output["descriptors"]):
+        print(f"Keypoints: {keypoints}")
+        print(f"Scores: {scores}")
+        print(f"Descriptors: {descriptors}")
 ```
 
 You can then print the keypoints on the image of your choice to visualize the result:

From e9b642abe9baef789055b48e788ed55d5fb2eb01 Mon Sep 17 00:00:00 2001
From: steven <steven.bucaille@gmail.com>
Date: Wed, 2 Oct 2024 22:31:04 +0200
Subject: [PATCH 24/32] style: applied make style

---
 tests/models/superpoint/test_modeling_superpoint.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/models/superpoint/test_modeling_superpoint.py b/tests/models/superpoint/test_modeling_superpoint.py
index d00a1b928470..94a2542a62c5 100644
--- a/tests/models/superpoint/test_modeling_superpoint.py
+++ b/tests/models/superpoint/test_modeling_superpoint.py
@@ -275,7 +275,9 @@ def test_inference(self):
         self.assertEqual(outputs.keypoints.shape, expected_keypoints_shape)
         self.assertEqual(outputs.scores.shape, expected_scores_shape)
         self.assertEqual(outputs.descriptors.shape, expected_descriptors_shape)
-        expected_keypoints_image0_values = torch.tensor([[0.75, 0.0188],[0.7719, 0.0188], [0.7641, 0.0333]]).to(torch_device)
+        expected_keypoints_image0_values = torch.tensor([[0.75, 0.0188], [0.7719, 0.0188], [0.7641, 0.0333]]).to(
+            torch_device
+        )
         expected_scores_image0_values = torch.tensor(
             [0.0064, 0.0139, 0.0591, 0.0727, 0.5170, 0.0175, 0.1526, 0.2057, 0.0335]
         ).to(torch_device)

From e08586115c780d77aadd4917553d127b2a2dcf0a Mon Sep 17 00:00:00 2001
From: steven <steven.bucaille@gmail.com>
Date: Thu, 3 Oct 2024 23:12:27 +0200
Subject: [PATCH 25/32] docs: added missing output type and precision in
 docstring of post_process_keypoint_detection

---
 .../models/superpoint/image_processing_superpoint.py   | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/transformers/models/superpoint/image_processing_superpoint.py b/src/transformers/models/superpoint/image_processing_superpoint.py
index c023a40a5164..11c130dffd42 100644
--- a/src/transformers/models/superpoint/image_processing_superpoint.py
+++ b/src/transformers/models/superpoint/image_processing_superpoint.py
@@ -279,21 +279,21 @@ def preprocess(
 
     def post_process_keypoint_detection(
         self, outputs: "SuperPointKeypointDescriptionOutput", target_sizes: Union[TensorType, List[Tuple]]
-    ):
+    ) -> List[Dict[str, torch.Tensor]]:
         """
         Converts the raw output of [`SuperPointForKeypointDetection`] into lists of keypoints, scores and descriptors
         with coordinates absolute to the original image sizes.
 
         Args:
             outputs ([`SuperPointKeypointDescriptionOutput`]):
-                Raw outputs of the model.
-            target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
+                Raw outputs of the model containing keypoints in a relative (x, y) format, with scores and descriptors.
+            target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`):
                 Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
                 `(height, width)` of each image in the batch. This must be the original
                 image size (before any processing).
         Returns:
-            `List[Dict]`: A list of dictionaries, each dictionary containing the keypoints, scores and descriptors for
-            an image in the batch as predicted by the model.
+            `List[Dict]`: A list of dictionaries, each dictionary containing the keypoints in absolute format according
+            to target_sizes, scores and descriptors for an image in the batch as predicted by the model.
         """
         if len(outputs.mask) != len(target_sizes):
             raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the mask")

From 91275455a0cdea8d5745f4e2d50010e605499409 Mon Sep 17 00:00:00 2001
From: steven <steven.bucaille@gmail.com>
Date: Thu, 3 Oct 2024 23:27:15 +0200
Subject: [PATCH 26/32] perf: deleted loop to perform keypoint conversion in
 one statement

---
 .../models/superpoint/image_processing_superpoint.py      | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/src/transformers/models/superpoint/image_processing_superpoint.py b/src/transformers/models/superpoint/image_processing_superpoint.py
index 11c130dffd42..820ee7abf9ed 100644
--- a/src/transformers/models/superpoint/image_processing_superpoint.py
+++ b/src/transformers/models/superpoint/image_processing_superpoint.py
@@ -307,11 +307,9 @@ def post_process_keypoint_detection(
                 )
             image_sizes = target_sizes
 
-        masked_keypoints = outputs.keypoints.clone()
-
-        for keypoints, image_size in zip(masked_keypoints, image_sizes):
-            keypoints[:, 0] = keypoints[:, 0] * image_size[1]
-            keypoints[:, 1] = keypoints[:, 1] * image_size[0]
+        # Flip the image sizes to (width, height) and convert keypoints to absolute coordinates
+        image_sizes = torch.flip(image_sizes, [1])
+        masked_keypoints = outputs.keypoints * image_sizes[:, None]
 
         # Convert masked_keypoints to int
         masked_keypoints = masked_keypoints.to(torch.int32)

From 1ffa4659aa409a47ed79b20facf6a766288e5977 Mon Sep 17 00:00:00 2001
From: steven <steven.bucaille@gmail.com>
Date: Thu, 3 Oct 2024 23:40:18 +0200
Subject: [PATCH 27/32] fix: moved keypoint conversion at the end of model
 forward

---
 src/transformers/models/superpoint/modeling_superpoint.py | 8 ++++----
 tests/models/superpoint/test_modeling_superpoint.py       | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/transformers/models/superpoint/modeling_superpoint.py b/src/transformers/models/superpoint/modeling_superpoint.py
index b77a90367d4a..163147d4d19b 100644
--- a/src/transformers/models/superpoint/modeling_superpoint.py
+++ b/src/transformers/models/superpoint/modeling_superpoint.py
@@ -258,9 +258,6 @@ def _extract_keypoints(self, scores: torch.Tensor) -> Tuple[torch.Tensor, torch.
         # Convert (y, x) to (x, y)
         keypoints = torch.flip(keypoints, [1]).float()
 
-        # Convert to relative coordinates
-        keypoints = keypoints / torch.tensor([width, height], device=keypoints.device)
-
         return keypoints, scores
 
 
@@ -450,7 +447,7 @@ def forward(
 
         pixel_values = self.extract_one_channel_pixel_values(pixel_values)
 
-        batch_size = pixel_values.shape[0]
+        batch_size, _, height, width = pixel_values.shape
 
         encoder_outputs = self.encoder(
             pixel_values,
@@ -488,6 +485,9 @@ def forward(
             descriptors[i, : _descriptors.shape[0]] = _descriptors
             mask[i, : _scores.shape[0]] = 1
 
+        # Convert to relative coordinates
+        keypoints[:, :] = keypoints[:, :] / torch.tensor([width, height], device=keypoints.device)
+
         hidden_states = encoder_outputs[1] if output_hidden_states else None
         if not return_dict:
             return tuple(v for v in [loss, keypoints, scores, descriptors, mask, hidden_states] if v is not None)
diff --git a/tests/models/superpoint/test_modeling_superpoint.py b/tests/models/superpoint/test_modeling_superpoint.py
index 94a2542a62c5..8db435502ca5 100644
--- a/tests/models/superpoint/test_modeling_superpoint.py
+++ b/tests/models/superpoint/test_modeling_superpoint.py
@@ -281,7 +281,7 @@ def test_inference(self):
         expected_scores_image0_values = torch.tensor(
             [0.0064, 0.0139, 0.0591, 0.0727, 0.5170, 0.0175, 0.1526, 0.2057, 0.0335]
         ).to(torch_device)
-        expected_descriptors_image0_value = torch.tensor(0.0449).to(torch_device)
+        expected_descriptors_image0_value = torch.tensor(-0.1095).to(torch_device)
         predicted_keypoints_image0_values = outputs.keypoints[0, :3]
         predicted_scores_image0_values = outputs.scores[0, :9]
         predicted_descriptors_image0_value = outputs.descriptors[0, 0, 0]

From b0d25a35f47b27c8942551724617e60c433c8c36 Mon Sep 17 00:00:00 2001
From: steven <steven.bucaille@gmail.com>
Date: Thu, 3 Oct 2024 23:46:30 +0200
Subject: [PATCH 28/32] docs: changed SuperPointInterestPointDecoder to
 SuperPointKeypointDecoder class name and added relative (x, y) coordinates
 information to its method

---
 .../models/superpoint/modeling_superpoint.py             | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/transformers/models/superpoint/modeling_superpoint.py b/src/transformers/models/superpoint/modeling_superpoint.py
index 163147d4d19b..34981938d5a2 100644
--- a/src/transformers/models/superpoint/modeling_superpoint.py
+++ b/src/transformers/models/superpoint/modeling_superpoint.py
@@ -192,7 +192,7 @@ def forward(
         )
 
 
-class SuperPointInterestPointDecoder(nn.Module):
+class SuperPointKeypointDecoder(nn.Module):
     """
     The SuperPointInterestPointDecoder uses the output of the SuperPointEncoder to compute the keypoint with scores.
     The scores are first computed by a convolutional layer, then a softmax is applied to get a probability distribution
@@ -239,7 +239,10 @@ def _get_pixel_scores(self, encoded: torch.Tensor) -> torch.Tensor:
         return scores
 
     def _extract_keypoints(self, scores: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-        """Based on their scores, extract the pixels that represent the keypoints that will be used for descriptors computation"""
+        """
+        Based on their scores, extract the pixels that represent the keypoints that will be used for descriptors computation.
+        The keypoints are in the form of absolute (x, y) pixel coordinates; the model's forward makes them relative.
+        """
         _, height, width = scores.shape
 
         # Threshold keypoints by score value
@@ -405,7 +408,7 @@ def __init__(self, config: SuperPointConfig) -> None:
         self.config = config
 
         self.encoder = SuperPointEncoder(config)
-        self.keypoint_decoder = SuperPointInterestPointDecoder(config)
+        self.keypoint_decoder = SuperPointKeypointDecoder(config)
         self.descriptor_decoder = SuperPointDescriptorDecoder(config)
 
         self.post_init()

From 1fb5705dd82b6dc17e57502df22c28fde72c729c Mon Sep 17 00:00:00 2001
From: steven <steven.bucaille@gmail.com>
Date: Thu, 3 Oct 2024 23:53:34 +0200
Subject: [PATCH 29/32] fix: changed type hint

---
 .../models/superpoint/image_processing_superpoint.py            | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/superpoint/image_processing_superpoint.py b/src/transformers/models/superpoint/image_processing_superpoint.py
index 820ee7abf9ed..65309b1c1826 100644
--- a/src/transformers/models/superpoint/image_processing_superpoint.py
+++ b/src/transformers/models/superpoint/image_processing_superpoint.py
@@ -279,7 +279,7 @@ def preprocess(
 
     def post_process_keypoint_detection(
         self, outputs: "SuperPointKeypointDescriptionOutput", target_sizes: Union[TensorType, List[Tuple]]
-    ) -> List[Dict[str, torch.Tensor]]:
+    ) -> List[Dict[str, "torch.Tensor"]]:
         """
         Converts the raw output of [`SuperPointForKeypointDetection`] into lists of keypoints, scores and descriptors
         with coordinates absolute to the original image sizes.

From 13cb7e5be54eb8a15462473160f6df013049c7b2 Mon Sep 17 00:00:00 2001
From: steven <steven.bucaille@gmail.com>
Date: Fri, 4 Oct 2024 13:44:42 +0200
Subject: [PATCH 30/32] refactor: removed unnecessary brackets

---
 src/transformers/models/superpoint/modeling_superpoint.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/superpoint/modeling_superpoint.py b/src/transformers/models/superpoint/modeling_superpoint.py
index 34981938d5a2..7ddd57d37c49 100644
--- a/src/transformers/models/superpoint/modeling_superpoint.py
+++ b/src/transformers/models/superpoint/modeling_superpoint.py
@@ -489,7 +489,7 @@ def forward(
             mask[i, : _scores.shape[0]] = 1
 
         # Convert to relative coordinates
-        keypoints[:, :] = keypoints[:, :] / torch.tensor([width, height], device=keypoints.device)
+        keypoints = keypoints / torch.tensor([width, height], device=keypoints.device)
 
         hidden_states = encoder_outputs[1] if output_hidden_states else None
         if not return_dict:

From eb6a5aad563b00c80650a92bbbab8eac277a8f2c Mon Sep 17 00:00:00 2001
From: steven <steven.bucaille@gmail.com>
Date: Fri, 4 Oct 2024 13:48:38 +0200
Subject: [PATCH 31/32] revert: SuperPointKeypointDecoder to
 SuperPointInterestPointDecoder

---
 src/transformers/models/superpoint/modeling_superpoint.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/superpoint/modeling_superpoint.py b/src/transformers/models/superpoint/modeling_superpoint.py
index 7ddd57d37c49..1075de299a9f 100644
--- a/src/transformers/models/superpoint/modeling_superpoint.py
+++ b/src/transformers/models/superpoint/modeling_superpoint.py
@@ -192,7 +192,7 @@ def forward(
         )
 
 
-class SuperPointKeypointDecoder(nn.Module):
+class SuperPointInterestPointDecoder(nn.Module):
     """
     The SuperPointInterestPointDecoder uses the output of the SuperPointEncoder to compute the keypoint with scores.
     The scores are first computed by a convolutional layer, then a softmax is applied to get a probability distribution
@@ -408,7 +408,7 @@ def __init__(self, config: SuperPointConfig) -> None:
         self.config = config
 
         self.encoder = SuperPointEncoder(config)
-        self.keypoint_decoder = SuperPointKeypointDecoder(config)
+        self.keypoint_decoder = SuperPointInterestPointDecoder(config)
         self.descriptor_decoder = SuperPointDescriptorDecoder(config)
 
         self.post_init()

From 4c34d752402cdb3a518038a6452e32e02e6c05d6 Mon Sep 17 00:00:00 2001
From: StevenBucaille <steven.bucaille@gmail.com>
Date: Fri, 4 Oct 2024 20:16:37 +0200
Subject: [PATCH 32/32] Update docs/source/en/model_doc/superpoint.md

Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com>
---
 docs/source/en/model_doc/superpoint.md | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/docs/source/en/model_doc/superpoint.md b/docs/source/en/model_doc/superpoint.md
index 3abd1ff03af0..59e451adceb8 100644
--- a/docs/source/en/model_doc/superpoint.md
+++ b/docs/source/en/model_doc/superpoint.md
@@ -101,12 +101,12 @@ You can then print the keypoints on the image of your choice to visualize the re
 import matplotlib.pyplot as plt
 
 plt.axis("off")
-plt.imshow(image)
+plt.imshow(image_1)
 plt.scatter(
-    keypoints[:, 0],
-    keypoints[:, 1],
-    c=scores * 100,
-    s=scores * 50,
+    outputs[0]["keypoints"][:, 0],
+    outputs[0]["keypoints"][:, 1],
+    c=outputs[0]["scores"] * 100,
+    s=outputs[0]["scores"] * 50,
     alpha=0.8
 )
 plt.savefig(f"output_image.png")