diff --git a/src/transformers/models/maskformer/feature_extraction_maskformer.py b/src/transformers/models/maskformer/feature_extraction_maskformer.py
index 3a5fd49d80fa..9801a4545e90 100644
--- a/src/transformers/models/maskformer/feature_extraction_maskformer.py
+++ b/src/transformers/models/maskformer/feature_extraction_maskformer.py
@@ -206,9 +206,11 @@ def __call__(
                 instance id. To convert it to a binary mask of shape (`batch, num_labels, height, width`) we need
                 a dictionary mapping instance ids to label ids to create a semantic segmentation map.
-            return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
-                If set, will return tensors instead of NumPy arrays. If set to `'pt'`, return PyTorch `torch.Tensor`
-                objects.
+            return_tensors (`str` or [`~utils.TensorType`], *optional*, defaults to `None`):
+                If set, will return a tensor of a particular framework.
+
+                Acceptable values are:
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.

         Returns:
             [`BatchFeature`]: A [`BatchFeature`] with the following fields:

@@ -283,8 +285,19 @@ def __call__(
                     image=image, target=None, size=self.size, max_size=self.max_size
                 )[0]

+        # if do_normalize=False, the casting to a numpy array won't happen, so we need to do it here
+        make_channel_first = True if isinstance(images[0], Image.Image) else images[0].shape[-1] in (1, 3)
+        images = [self.to_numpy_array(image, rescale=False, channel_first=make_channel_first) for image in images]
+        if segmentation_maps is not None:
+            segmentation_maps = [
+                self.to_numpy_array(segmap, rescale=False, channel_first=True) for segmap in segmentation_maps
+            ]
+
         if self.do_normalize:
-            images = [self.normalize(image=image, mean=self.image_mean, std=self.image_std) for image in images]
+            images = [
+                self.normalize(image=image, mean=self.image_mean, std=self.image_std, rescale=True) for image in images
+            ]

+        # NOTE: the images always have to be padded, since they are stacked along the batch dimension
         encoded_inputs = self.encode_inputs(
             images,
diff --git a/tests/models/maskformer/test_feature_extraction_maskformer.py b/tests/models/maskformer/test_feature_extraction_maskformer.py
index 461add8c0355..a8d14502aff9 100644
--- a/tests/models/maskformer/test_feature_extraction_maskformer.py
+++ b/tests/models/maskformer/test_feature_extraction_maskformer.py
@@ -18,6 +18,7 @@

 import numpy as np

+from parameterized import parameterized
 from transformers.testing_utils import require_torch, require_vision
 from transformers.utils import is_torch_available, is_vision_available

@@ -401,3 +402,39 @@ def test_post_process_panoptic_segmentation(self):
         self.assertEqual(
             el["segmentation"].shape, (self.feature_extract_tester.height, self.feature_extract_tester.width)
         )
+
+    @require_torch
+    @parameterized.expand(
+        [
+            ("do_resize_True_do_normalize_True", True, True),
+            ("do_resize_True_do_normalize_False", True, False),
+            ("do_resize_False_do_normalize_True", False, True),
+            ("do_resize_False_do_normalize_False", False, False),
+        ]
+    )
+    def test_call_flags(self, _, do_resize, do_normalize):
+        # Initialize feature_extractor with the flag combination under test
+        feature_extractor = self.feature_extraction_class(**self.feat_extract_dict)
+        feature_extractor.do_resize = do_resize
+        feature_extractor.do_normalize = do_normalize
+        # Create random PIL images
+        image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False)
+
+        all_image_shapes = [img.size[::-1] for img in image_inputs]
+        if do_resize:
+            all_image_shapes = [
+                self.feature_extract_tester.get_expected_values([image], batched=False) for image in image_inputs
+            ]
+
+        max_across_dim = [max(shape) for shape in zip(*all_image_shapes)]
+        expected_shape = (
+            self.feature_extract_tester.batch_size,
+            self.feature_extract_tester.num_channels,
+            *max_across_dim,
+        )
+
+        pixel_values = feature_extractor(image_inputs, return_tensors="pt")["pixel_values"]
+        self.assertEqual(len(pixel_values), self.feature_extract_tester.batch_size)
+
+        self.assertEqual(pixel_values.shape, expected_shape)
+        self.assertIsInstance(pixel_values, torch.Tensor)
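
For context, the behavior the new to_numpy_array block enables can be sketched outside the test suite. This is a minimal illustration, not part of the patch: the input sizes are arbitrary and the expected padded shape is illustrative.

import numpy as np

from transformers import MaskFormerFeatureExtractor

# Per the comment in the patch: with do_normalize=False the images were never
# cast to NumPy arrays, so padding/stacking along the batch dimension failed.
# After this change the cast happens unconditionally before padding.
feature_extractor = MaskFormerFeatureExtractor(do_resize=False, do_normalize=False)

# Two HWC uint8 images of different sizes, as a user might pass them in.
images = [
    np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8),
    np.random.randint(0, 256, (512, 512, 3), dtype=np.uint8),
]

encoded = feature_extractor(images, return_tensors="pt")
# Both images are padded to the batch-wide max height/width and stacked,
# e.g. (2, 3, 512, 640) here.
print(encoded["pixel_values"].shape)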