diff --git a/src/transformers/models/clip/feature_extraction_clip.py b/src/transformers/models/clip/feature_extraction_clip.py
index 4784955b9b63..7f01b5e02b94 100644
--- a/src/transformers/models/clip/feature_extraction_clip.py
+++ b/src/transformers/models/clip/feature_extraction_clip.py
@@ -108,15 +108,13 @@ def __call__(
                 tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
                 number of channels, H and W are image height and width.
 
-            return_tensors (`str` or [`~utils.TensorType`], *optional*, defaults to `None`):
-                If set, will return a tensor of a particular framework.
+            return_tensors (`str` or [`~utils.TensorType`], *optional*, defaults to `'np'`):
+                If set, will return tensors of a particular framework. Acceptable values are:
 
-                Acceptable values are:
-                - `'tf'`: Return TensorFlow `tf.constant` object.
-                - `'pt'`: Return PyTorch `torch.Tensor` object.
-                - `'np'`: Return NumPy `np.ndarray` object.
-                - `'jax'`: Return JAX `jnp.ndarray` object.
-                - None: Return list of `np.ndarray` objects.
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
 
         Returns:
             [`BatchFeature`]: A [`BatchFeature`] with the following fields:
@@ -157,15 +155,8 @@ def __call__(
             ]
         if self.do_center_crop and self.crop_size is not None:
             images = [self.center_crop(image, self.crop_size) for image in images]
-
-        # if do_normalize=False, the casting to a numpy array won't happen, so we need to do it here
-        make_channel_first = True if isinstance(images[0], Image.Image) else images[0].shape[-1] in (1, 3)
-        images = [self.to_numpy_array(image, rescale=False, channel_first=make_channel_first) for image in images]
-
         if self.do_normalize:
-            images = [
-                self.normalize(image=image, mean=self.image_mean, std=self.image_std, rescale=True) for image in images
-            ]
+            images = [self.normalize(image=image, mean=self.image_mean, std=self.image_std) for image in images]
 
         # return as BatchFeature
         data = {"pixel_values": images}
diff --git a/tests/models/clip/test_feature_extraction_clip.py b/tests/models/clip/test_feature_extraction_clip.py
index 13b8a2cc5fe3..8f36a65ae2d5 100644
--- a/tests/models/clip/test_feature_extraction_clip.py
+++ b/tests/models/clip/test_feature_extraction_clip.py
@@ -17,7 +17,6 @@
 import unittest
 
 import numpy as np
-from parameterized import parameterized
 
 from transformers.testing_utils import require_torch, require_vision
 from transformers.utils import is_torch_available, is_vision_available
@@ -233,58 +232,6 @@ def test_call_pytorch(self):
             ),
         )
 
-    @parameterized.expand(
-        [
-            ("do_resize_True_do_center_crop_True_do_normalize_True", True, True, True),
-            ("do_resize_True_do_center_crop_True_do_normalize_False", True, True, False),
-            ("do_resize_True_do_center_crop_False_do_normalize_True", True, False, True),
-            ("do_resize_True_do_center_crop_False_do_normalize_False", True, False, False),
-            ("do_resize_False_do_center_crop_True_do_normalize_True", False, True, True),
-            ("do_resize_False_do_center_crop_True_do_normalize_False", False, True, False),
-            ("do_resize_False_do_center_crop_False_do_normalize_True", False, False, True),
-            ("do_resize_False_do_center_crop_False_do_normalize_False", False, False, False),
-        ]
-    )
-    def test_call_flags(self, _, do_resize, do_center_crop, do_normalize):
-        # Initialize feature_extractor
-        feature_extractor = self.feature_extraction_class(**self.feat_extract_dict)
-        feature_extractor.do_center_crop = do_center_crop
-        feature_extractor.do_resize = do_resize
-        feature_extractor.do_normalize = do_normalize
-        # create random PIL images
-        image_inputs = self.feature_extract_tester.prepare_inputs(equal_resolution=False, torchify=True)
-
-        expected_shapes = [x.shape for x in image_inputs]
-        if do_resize:
-            # Same size logic inside resized
-            resized_shapes = []
-            for shape in expected_shapes:
-                c, h, w = shape
-                short, long = (w, h) if w <= h else (h, w)
-                min_size = self.feature_extract_tester.size
-                if short == min_size:
-                    resized_shapes.append((c, h, w))
-                else:
-                    short, long = min_size, int(long * min_size / short)
-                    resized_shape = (c, long, short) if w <= h else (c, short, long)
-                    resized_shapes.append(resized_shape)
-            expected_shapes = resized_shapes
-        if do_center_crop:
-            expected_shapes = [
-                (
-                    self.feature_extract_tester.num_channels,
-                    self.feature_extract_tester.crop_size,
-                    self.feature_extract_tester.crop_size,
-                )
-                for _ in range(self.feature_extract_tester.batch_size)
-            ]
-
-        pixel_values = feature_extractor(image_inputs, return_tensors=None)["pixel_values"]
-        self.assertEqual(len(pixel_values), self.feature_extract_tester.batch_size)
-        for idx, image in enumerate(pixel_values):
-            self.assertEqual(image.shape, expected_shapes[idx])
-            self.assertIsInstance(image, np.ndarray)
-
 
 @require_torch
 @require_vision
@@ -345,52 +292,3 @@ def test_call_pil_four_channels(self):
                 self.feature_extract_tester.crop_size,
             ),
         )
-
-    @parameterized.expand(
-        [
-            ("do_resize_True_do_center_crop_True_do_normalize_True", True, True, True),
-            ("do_resize_True_do_center_crop_True_do_normalize_False", True, True, False),
-            ("do_resize_True_do_center_crop_False_do_normalize_True", True, False, True),
-            ("do_resize_True_do_center_crop_False_do_normalize_False", True, False, False),
-            ("do_resize_False_do_center_crop_True_do_normalize_True", False, True, True),
-            ("do_resize_False_do_center_crop_True_do_normalize_False", False, True, False),
-            ("do_resize_False_do_center_crop_False_do_normalize_True", False, False, True),
-            ("do_resize_False_do_center_crop_False_do_normalize_False", False, False, False),
-        ]
-    )
-    def test_call_flags_four_channels(self, _, do_resize, do_center_crop, do_normalize):
-        # Initialize feature_extractor
-        feature_extractor = self.feature_extraction_class(**self.feat_extract_dict)
-        feature_extractor.do_center_crop = do_center_crop
-        feature_extractor.do_resize = do_resize
-        feature_extractor.do_normalize = do_normalize
-        # create random PIL images
-        # We can't currently pass in 4 channel pytorch images
-        image_inputs = self.feature_extract_tester.prepare_inputs(equal_resolution=False)
-
-        output_channels = self.expected_encoded_image_num_channels
-        crop_size = self.feature_extract_tester.crop_size
-        batch_size = self.feature_extract_tester.batch_size
-        expected_shapes = [(output_channels, *x.size[::-1]) for x in image_inputs]
-        if do_resize:
-            # Same size logic inside resized
-            resized_shapes = []
-            for shape in expected_shapes:
-                c, h, w = shape
-                short, long = (w, h) if w <= h else (h, w)
-                min_size = self.feature_extract_tester.size
-                if short == min_size:
-                    resized_shapes.append((c, h, w))
-                else:
-                    short, long = min_size, int(long * min_size / short)
-                    resized_shape = (c, long, short) if w <= h else (c, short, long)
-                    resized_shapes.append(resized_shape)
-            expected_shapes = resized_shapes
-        if do_center_crop:
-            expected_shapes = [(output_channels, crop_size, crop_size) for _ in range(batch_size)]
-
-        pixel_values = feature_extractor(image_inputs, return_tensors=None)["pixel_values"]
-        self.assertEqual(len(pixel_values), self.feature_extract_tester.batch_size)
-        for idx, image in enumerate(pixel_values):
-            self.assertEqual(image.shape, expected_shapes[idx])
-            self.assertIsInstance(image, np.ndarray)