diff --git a/src/transformers/generation/continuous_batching/cache.py b/src/transformers/generation/continuous_batching/cache.py index 05de093f661f..8d6e057be84a 100644 --- a/src/transformers/generation/continuous_batching/cache.py +++ b/src/transformers/generation/continuous_batching/cache.py @@ -79,7 +79,7 @@ class PagedAttentionCache: layer group, and the shape of the cache tensor is `[num_blocks * block_size, num_heads, head_size]`. Grouping layers into groups is useful because when we allocate one block to a group N, the block allocated is the - same for all layers in group N, equivalently it is allocated accross all cache tensors. This allows us to + same for all layers in group N, equivalently it is allocated across all cache tensors. This allows us to efficiently allocate and free blocks, and to efficiently read and write key and value states. For instance, imagine we have 8 blocks of cache and a model with two layer groups: a full-attention group with 3 @@ -349,7 +349,7 @@ class PagedAttentionMemoryHandler: The memory footprint consists of three main components: - Cache memory: the space needed to store the cache tensors: 2 * layer_group_size * [num_pages, page_size] * cache_dtype - - Activation memory: the space temporarly taken by the largest activation during the model forward pass: + - Activation memory: the space temporarily taken by the largest activation during the model forward pass: peak_activation_per_token * max_tokens_per_batch * activation_dtype_size - Static tensors: the space taken by the input/output buffers and metadata tensors for batch processing, sum of: - inputs_ids + outputs_ids + position_ids + logits_indices: 4 * max_tokens_per_batch * int32_size diff --git a/src/transformers/models/big_bird/modeling_big_bird.py b/src/transformers/models/big_bird/modeling_big_bird.py index 69dc11a7cb69..a25c412e688a 100755 --- a/src/transformers/models/big_bird/modeling_big_bird.py +++ b/src/transformers/models/big_bird/modeling_big_bird.py @@ -1108,14 +1108,14 @@ def _get_single_block_row_attention( if block_id == to_end_block_id - 2: illegal_blocks.append(1) - selected_random_blokcs = [] + selected_random_blocks = [] for i in range(to_end_block_id - to_start_block_id): if perm_block[i] not in illegal_blocks: - selected_random_blokcs.append(perm_block[i]) - if len(selected_random_blokcs) == num_rand_blocks: + selected_random_blocks.append(perm_block[i]) + if len(selected_random_blocks) == num_rand_blocks: break - return np.array(selected_random_blokcs, dtype=np.int32) + return np.array(selected_random_blocks, dtype=np.int32) # Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->BigBird diff --git a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py index 70644c8d3df2..2433660a687f 100755 --- a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +++ b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py @@ -1086,14 +1086,14 @@ def _get_single_block_row_attention( if block_id == to_end_block_id - 2: illegal_blocks.append(1) - selected_random_blokcs = [] + selected_random_blocks = [] for i in range(to_end_block_id - to_start_block_id): if perm_block[i] not in illegal_blocks: - selected_random_blokcs.append(perm_block[i]) - if len(selected_random_blokcs) == num_rand_blocks: + selected_random_blocks.append(perm_block[i]) + if len(selected_random_blocks) == num_rand_blocks: break - return np.array(selected_random_blokcs, dtype=np.int32) + 
return np.array(selected_random_blocks, dtype=np.int32) class BigBirdPegasusEncoderAttention(nn.Module): diff --git a/src/transformers/models/cpmant/modeling_cpmant.py b/src/transformers/models/cpmant/modeling_cpmant.py index 1930cc0e8793..15881a64eb37 100755 --- a/src/transformers/models/cpmant/modeling_cpmant.py +++ b/src/transformers/models/cpmant/modeling_cpmant.py @@ -351,7 +351,7 @@ def forward( output_hidden_states: Optional[bool] = None, past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = None, - cache_postion: Optional[torch.Tensor] = None, + cache_position: Optional[torch.Tensor] = None, ): """ Args: @@ -492,16 +492,16 @@ def _position_bucket(self, relative_position, num_buckets=32, max_distance=128): relative_position = torch.abs(relative_position) max_exact = num_buckets // 2 is_small = relative_position < max_exact - relative_postion_if_large = max_exact + ( + relative_position_if_large = max_exact + ( torch.log(relative_position.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact) ).to(torch.int32) - relative_postion_if_large = torch.min( - relative_postion_if_large, - torch.full_like(relative_postion_if_large, num_buckets - 1), + relative_position_if_large = torch.min( + relative_position_if_large, + torch.full_like(relative_position_if_large, num_buckets - 1), ) - relative_buckets += torch.where(is_small, relative_position.to(torch.int32), relative_postion_if_large) + relative_buckets += torch.where(is_small, relative_position.to(torch.int32), relative_position_if_large) return relative_buckets diff --git a/src/transformers/models/gemma3/convert_gemma3_weights.py b/src/transformers/models/gemma3/convert_gemma3_weights.py index 8d7a21219197..aefd9648d3fe 100644 --- a/src/transformers/models/gemma3/convert_gemma3_weights.py +++ b/src/transformers/models/gemma3/convert_gemma3_weights.py @@ -439,9 +439,9 @@ def convert_transformer_weights( decoder_block_start = path.find(_TRANSFORMER_DECODER_BLOCK) decoder_block_offset = decoder_block_start + _TRANSFORMER_DECODER_BLOCK_LEN decoder_block_path = path[decoder_block_offset:] - next_path_seperator_idx = decoder_block_path.find("/") - layer_idx = decoder_block_path[:next_path_seperator_idx] - decoder_block_path = decoder_block_path[next_path_seperator_idx:] + next_path_separator_idx = decoder_block_path.find("/") + layer_idx = decoder_block_path[:next_path_separator_idx] + decoder_block_path = decoder_block_path[next_path_separator_idx:] base_path = f"language_model.model.layers.{layer_idx}" diff --git a/src/transformers/models/git/modeling_git.py b/src/transformers/models/git/modeling_git.py index b98d2b1c231c..82a1d5e451ca 100644 --- a/src/transformers/models/git/modeling_git.py +++ b/src/transformers/models/git/modeling_git.py @@ -950,7 +950,7 @@ def __init__(self, config): self.visual_projection = GitProjection(config) if config.num_image_with_embedding is not None: - self.img_temperal_embedding = nn.ParameterList( + self.img_temporal_embedding = nn.ParameterList( nn.Parameter(torch.zeros(1, 1, config.vision_config.hidden_size)) for _ in range(config.num_image_with_embedding) ) @@ -1115,7 +1115,7 @@ def forward( visual_features_frame = self.image_encoder( pixel_values[:, frame_idx, :, :], interpolate_pos_encoding=interpolate_pos_encoding ).last_hidden_state - visual_features_frame += self.img_temperal_embedding[frame_idx] + visual_features_frame += self.img_temporal_embedding[frame_idx] visual_features.append(visual_features_frame) # finally, concatenate all features along 
sequence dimension diff --git a/src/transformers/models/groupvit/modeling_groupvit.py b/src/transformers/models/groupvit/modeling_groupvit.py index 65fdaaa784d3..598845750da2 100644 --- a/src/transformers/models/groupvit/modeling_groupvit.py +++ b/src/transformers/models/groupvit/modeling_groupvit.py @@ -74,7 +74,7 @@ def gumbel_softmax(logits: torch.Tensor, tau: float = 1, hard: bool = False, dim y_hard = torch.zeros_like(logits, memory_format=torch.legacy_contiguous_format).scatter_(dim, index, 1.0) ret = y_hard - y_soft.detach() + y_soft else: - # Reparametrization trick. + # Reparameterization trick. ret = y_soft return ret @@ -662,7 +662,7 @@ def forward( attn_weights = nn.functional.softmax(attn_weights, dim=-1) if output_attentions: - # this operation is a bit akward, but it's required to + # this operation is a bit awkward, but it's required to # make sure that attn_weights keeps its gradient. # In order to do so, attn_weights have to reshaped # twice and have to be reused in the following diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt.py b/src/transformers/models/imagegpt/image_processing_imagegpt.py index 97086ed45e07..ee8fe04771b7 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt.py @@ -242,7 +242,7 @@ def preprocess( raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, or torch.Tensor") # Here, normalize() is using a constant factor to divide pixel values. - # hence, the method does not need iamge_mean and image_std. + # hence, the method does not need image_mean and image_std. validate_preprocess_arguments( do_resize=do_resize, size=size, diff --git a/src/transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py b/src/transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py index c539288d9913..028ccf6bf8a2 100644 --- a/src/transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +++ b/src/transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py @@ -34,7 +34,7 @@ # Similar to transformers.models.pix2struct.image_processing_pix2struct.torch_extract_patches but dealing with a batch of images directly. def torch_extract_patches(image_tensor, patch_height, patch_width): """ - Utiliy function to extract patches from a given tensor representing a batch of images. Returns a tensor of shape + Utility function to extract patches from a given tensor representing a batch of images. Returns a tensor of shape (batch_size, `rows`, `columns`, `num_channels` x `patch_height` x `patch_width`). 
Args: diff --git a/src/transformers/models/kyutai_speech_to_text/feature_extraction_kyutai_speech_to_text.py b/src/transformers/models/kyutai_speech_to_text/feature_extraction_kyutai_speech_to_text.py index fa0ce5e11ded..8fa8f0a78875 100644 --- a/src/transformers/models/kyutai_speech_to_text/feature_extraction_kyutai_speech_to_text.py +++ b/src/transformers/models/kyutai_speech_to_text/feature_extraction_kyutai_speech_to_text.py @@ -203,7 +203,7 @@ def __call__( if padding: padded_inputs["padding_mask"] = padded_inputs.pop("attention_mask") - # now let's padd left and right + # now let's pad left and right pad_left = int(self.audio_silence_prefix_seconds * self.sampling_rate) pad_right = int((self.audio_delay_seconds + 1.0) * self.sampling_rate) padded_inputs["input_values"] = np.pad( diff --git a/src/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py b/src/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py index 9eba7e163670..77c636570d58 100644 --- a/src/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +++ b/src/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py @@ -1078,7 +1078,7 @@ def __init__(self, config): self.codec_model = AutoModel.from_config(config.codec_config) # we are in an edge case where for the codec_model self.can_generate is False, setting self.codec_model.generation_config to None - # yet the codec_model needs a generation config to initalize it's cache for streaming inference + # yet the codec_model needs a generation config to initialize it's cache for streaming inference # we therefore initialize a generation config for the codec model self.codec_model.generation_config = GenerationConfig.from_model_config(config.codec_config) diff --git a/src/transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py b/src/transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py index 16e8f6cd6dcb..af8c182f226e 100644 --- a/src/transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +++ b/src/transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py @@ -182,7 +182,7 @@ def __call__( if padding: padded_inputs["padding_mask"] = padded_inputs.pop("attention_mask") - # now let's padd left and right + # now let's pad left and right pad_left = int(self.audio_silence_prefix_seconds * self.sampling_rate) pad_right = int((self.audio_delay_seconds + 1.0) * self.sampling_rate) padded_inputs["input_values"] = np.pad( @@ -258,7 +258,7 @@ def __init__(self, config): self.codec_model = AutoModel.from_config(config.codec_config) # we are in an edge case where for the codec_model self.can_generate is False, setting self.codec_model.generation_config to None - # yet the codec_model needs a generation config to initalize it's cache for streaming inference + # yet the codec_model needs a generation config to initialize it's cache for streaming inference # we therefore initialize a generation config for the codec model self.codec_model.generation_config = GenerationConfig.from_model_config(config.codec_config) diff --git a/src/transformers/models/oneformer/modeling_oneformer.py b/src/transformers/models/oneformer/modeling_oneformer.py index a5336f6fc490..dc44ad67f71f 100644 --- a/src/transformers/models/oneformer/modeling_oneformer.py +++ b/src/transformers/models/oneformer/modeling_oneformer.py @@ -2882,7 +2882,7 @@ def forward( Task inputs. Task inputs can be obtained using [`AutoImageProcessor`]. 
See [`OneFormerProcessor.__call__`] for details. text_inputs (`list[torch.Tensor]`, *optional*): - Tensor fof shape `(num_queries, sequence_length)` to be fed to a model + Tensor of shape `(num_queries, sequence_length)` to be fed to a model Example: @@ -3068,7 +3068,7 @@ def forward( Task inputs. Task inputs can be obtained using [`AutoImageProcessor`]. See [`OneFormerProcessor.__call__`] for details. text_inputs (`list[torch.Tensor]`, *optional*): - Tensor fof shape `(num_queries, sequence_length)` to be fed to a model + Tensor of shape `(num_queries, sequence_length)` to be fed to a model mask_labels (`list[torch.Tensor]`, *optional*): List of mask labels of shape `(num_labels, height, width)` to be fed to a model class_labels (`list[torch.LongTensor]`, *optional*): diff --git a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py index be55c39572d5..c26132a48439 100644 --- a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py +++ b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py @@ -190,7 +190,7 @@ def _fit_image_to_canvas(self, img_width: int, img_height: int, tile_size: int): target_width=n_w * tile_size, target_height=n_h * tile_size, ) - # Llama3V dynamic tiling. Priortize biggest canvas. + # Llama3V dynamic tiling. Prioritize biggest canvas. if (scale < 1.0 and (image_width_height[0] >= optimal_image_width_height[0])) or ( scale >= 1.0 and (image_width_height[1] >= optimal_image_width_height[1]) ): diff --git a/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py index 3b6c2ca1d979..3c1fdb8b0a8c 100644 --- a/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +++ b/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py @@ -137,7 +137,7 @@ class Phi4MultimodalAudioConfig(PretrainedConfig): The dropout ratio. ext_pw_out_channel (`int`, *optional*, defaults to 1024): Number of out channels in the point-wise conv modules. - depthwise_seperable_out_channel (`int`, *optional*, defaults to 1024): + depthwise_separable_out_channel (`int`, *optional*, defaults to 1024): Number of out channels in the depth-wise separable conv modules. depthwise_multiplier (`int`, *optional*, defaults to 1): Input size multiplier for the depth-wise separable conv modules. 
@@ -190,7 +190,7 @@ def __init__( left_chunk: int = 18, dropout_rate: float = 0.0, ext_pw_out_channel: int = 1024, - depthwise_seperable_out_channel: int = 1024, + depthwise_separable_out_channel: int = 1024, depthwise_multiplier: int = 1, kernel_size: int = 3, conv_activation: str = "swish", @@ -217,7 +217,7 @@ def __init__( self.num_blocks = num_blocks self.dropout_rate = dropout_rate self.ext_pw_out_channel = ext_pw_out_channel - self.depthwise_seperable_out_channel = depthwise_seperable_out_channel + self.depthwise_separable_out_channel = depthwise_separable_out_channel self.depthwise_multiplier = depthwise_multiplier self.kernel_size = kernel_size self.conv_activation = conv_activation diff --git a/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py index 349f2e02e2f2..ad2ef3e07124 100644 --- a/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +++ b/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py @@ -746,7 +746,7 @@ def forward( return attn_output -class Phi4MultimodalAudioDepthWiseSeperableConv1d(nn.Module): +class Phi4MultimodalAudioDepthWiseSeparableConv1d(nn.Module): def __init__(self, config: Phi4MultimodalAudioConfig, padding: int = 0): super().__init__() self.dw_conv = nn.Conv1d( @@ -758,7 +758,7 @@ def __init__(self, config: Phi4MultimodalAudioConfig, padding: int = 0): groups=config.hidden_size, ) self.pw_conv = nn.Conv1d( - config.hidden_size * config.depthwise_multiplier, config.depthwise_seperable_out_channel, 1, 1, 0 + config.hidden_size * config.depthwise_multiplier, config.depthwise_separable_out_channel, 1, 1, 0 ) def forward(self, hidden_states): @@ -794,7 +794,7 @@ def __init__(self, config: Phi4MultimodalAudioConfig): self.layer_norm = nn.LayerNorm(config.hidden_size) self.glu = Phi4MultimodalAudioGluPointWiseConv(config) - self.dw_sep_conv_1d = Phi4MultimodalAudioDepthWiseSeperableConv1d(config, padding=config.kernel_size - 1) + self.dw_sep_conv_1d = Phi4MultimodalAudioDepthWiseSeparableConv1d(config, padding=config.kernel_size - 1) self.act = ACT2FN[config.conv_activation] self.ext_pw_conv_1d = nn.Conv1d(config.hidden_size, config.ext_pw_out_channel, kernel_size=1, stride=1) self.dropout = nn.Dropout(config.dropout_rate) diff --git a/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py index ea226e4e1981..0514136cad85 100644 --- a/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py +++ b/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py @@ -174,7 +174,7 @@ class Phi4MultimodalAudioConfig(PretrainedConfig): The dropout ratio. ext_pw_out_channel (`int`, *optional*, defaults to 1024): Number of out channels in the point-wise conv modules. - depthwise_seperable_out_channel (`int`, *optional*, defaults to 1024): + depthwise_separable_out_channel (`int`, *optional*, defaults to 1024): Number of out channels in the depth-wise separable conv modules. depthwise_multiplier (`int`, *optional*, defaults to 1): Input size multiplier for the depth-wise separable conv modules. 
@@ -227,7 +227,7 @@ def __init__( left_chunk: int = 18, dropout_rate: float = 0.0, ext_pw_out_channel: int = 1024, - depthwise_seperable_out_channel: int = 1024, + depthwise_separable_out_channel: int = 1024, depthwise_multiplier: int = 1, kernel_size: int = 3, conv_activation: str = "swish", @@ -254,7 +254,7 @@ def __init__( self.num_blocks = num_blocks self.dropout_rate = dropout_rate self.ext_pw_out_channel = ext_pw_out_channel - self.depthwise_seperable_out_channel = depthwise_seperable_out_channel + self.depthwise_separable_out_channel = depthwise_separable_out_channel self.depthwise_multiplier = depthwise_multiplier self.kernel_size = kernel_size self.conv_activation = conv_activation @@ -930,7 +930,7 @@ def forward( return attn_output -class Phi4MultimodalAudioDepthWiseSeperableConv1d(nn.Module): +class Phi4MultimodalAudioDepthWiseSeparableConv1d(nn.Module): def __init__(self, config: Phi4MultimodalAudioConfig, padding: int = 0): super().__init__() self.dw_conv = nn.Conv1d( @@ -942,7 +942,7 @@ def __init__(self, config: Phi4MultimodalAudioConfig, padding: int = 0): groups=config.hidden_size, ) self.pw_conv = nn.Conv1d( - config.hidden_size * config.depthwise_multiplier, config.depthwise_seperable_out_channel, 1, 1, 0 + config.hidden_size * config.depthwise_multiplier, config.depthwise_separable_out_channel, 1, 1, 0 ) def forward(self, hidden_states): @@ -978,7 +978,7 @@ def __init__(self, config: Phi4MultimodalAudioConfig): self.layer_norm = nn.LayerNorm(config.hidden_size) self.glu = Phi4MultimodalAudioGluPointWiseConv(config) - self.dw_sep_conv_1d = Phi4MultimodalAudioDepthWiseSeperableConv1d(config, padding=config.kernel_size - 1) + self.dw_sep_conv_1d = Phi4MultimodalAudioDepthWiseSeparableConv1d(config, padding=config.kernel_size - 1) self.act = ACT2FN[config.conv_activation] self.ext_pw_conv_1d = nn.Conv1d(config.hidden_size, config.ext_pw_out_channel, kernel_size=1, stride=1) self.dropout = nn.Dropout(config.dropout_rate) diff --git a/src/transformers/models/rwkv/convert_rwkv_checkpoint_to_hf.py b/src/transformers/models/rwkv/convert_rwkv_checkpoint_to_hf.py index 87d35db22363..33044a4d1271 100644 --- a/src/transformers/models/rwkv/convert_rwkv_checkpoint_to_hf.py +++ b/src/transformers/models/rwkv/convert_rwkv_checkpoint_to_hf.py @@ -36,7 +36,7 @@ "14B": 40, } -HIDEN_SIZE_MAPPING = { +HIDDEN_SIZE_MAPPING = { "169M": 768, "430M": 1024, "1B5": 2048, @@ -106,7 +106,7 @@ def convert_rmkv_checkpoint_to_hf_format( config = RwkvConfig( vocab_size=vocab_size, num_hidden_layers=NUM_HIDDEN_LAYERS_MAPPING[size], - hidden_size=HIDEN_SIZE_MAPPING[size], + hidden_size=HIDDEN_SIZE_MAPPING[size], ) config.save_pretrained(output_dir) diff --git a/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py b/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py index 5078d437e978..687d802b24ba 100755 --- a/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py +++ b/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py @@ -2187,7 +2187,7 @@ def __init__(self, config): kernel_size=kernel_size, padding=(kernel_size - 1) // 2, ) - self.activation_fuction = nn.ReLU() + self.activation_function = nn.ReLU() self.ln1 = nn.LayerNorm(embed_dim) self.dropout_module = nn.Dropout(p=var_pred_dropout) self.conv2 = nn.Conv1d( @@ -2202,10 +2202,10 @@ def __init__(self, config): def forward(self, hidden_states: Tensor) -> Tensor: # Input: B x T x C; Output: B x T hidden_states = self.conv1(hidden_states.transpose(1, 2)) - hidden_states = 
self.activation_fuction(hidden_states).transpose(1, 2) + hidden_states = self.activation_function(hidden_states).transpose(1, 2) hidden_states = self.dropout_module(self.ln1(hidden_states)) hidden_states = self.conv2(hidden_states.transpose(1, 2)) - hidden_states = self.activation_fuction(hidden_states).transpose(1, 2) + hidden_states = self.activation_function(hidden_states).transpose(1, 2) hidden_states = self.dropout_module(self.ln2(hidden_states)) return self.proj(hidden_states).squeeze(dim=2) diff --git a/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py b/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py index 7aa15cb84ddd..0ff3593bbaf2 100644 --- a/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +++ b/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py @@ -2380,7 +2380,7 @@ def __init__(self, embed_dim, hidden_dim, kernel_size, var_pred_dropout): kernel_size=kernel_size, padding="same", ) - self.activation_fuction = nn.ReLU() + self.activation_function = nn.ReLU() self.ln1 = nn.LayerNorm(hidden_dim) self.dropout_module = nn.Dropout(p=var_pred_dropout) self.conv2 = nn.Conv1d( @@ -2397,12 +2397,12 @@ def forward(self, hidden_states: Tensor, padding_mask: Optional[Tensor] = None) if padding_mask is not None: hidden_states = hidden_states.masked_fill(~padding_mask.bool().unsqueeze(-1), 0.0) hidden_states = self.conv1(hidden_states.transpose(1, 2)) - hidden_states = self.activation_fuction(hidden_states).transpose(1, 2) + hidden_states = self.activation_function(hidden_states).transpose(1, 2) hidden_states = self.dropout_module(self.ln1(hidden_states)) if padding_mask is not None: hidden_states = hidden_states.masked_fill(~padding_mask.bool().unsqueeze(-1), 0.0) hidden_states = self.conv2(hidden_states.transpose(1, 2)) - hidden_states = self.activation_fuction(hidden_states).transpose(1, 2) + hidden_states = self.activation_function(hidden_states).transpose(1, 2) hidden_states = self.dropout_module(self.ln2(hidden_states)) return self.proj(hidden_states).squeeze(dim=2) diff --git a/src/transformers/models/voxtral/processing_voxtral.py b/src/transformers/models/voxtral/processing_voxtral.py index 1166c9636307..1b812ba60a4b 100644 --- a/src/transformers/models/voxtral/processing_voxtral.py +++ b/src/transformers/models/voxtral/processing_voxtral.py @@ -88,7 +88,7 @@ def __init__( super().__init__(feature_extractor, tokenizer) - def _retreive_input_features(self, audio, max_source_positions, **kwargs): + def _retrieve_input_features(self, audio, max_source_positions, **kwargs): """ Handles specific logic of Voxtral expected input features: audio arrays should be padded to next multiple of 480000 (duration is a multiple of 30s), see VoxtralProcessorKwargs' default audio_kwargs. Then mel input features are extracted and stacked along batch dimension, splitting into chunks of max_source_positions. 
@@ -222,7 +222,7 @@ def apply_chat_template( data = dict(encoded_instruct_inputs) if audio is not None: max_source_positions = audio_kwargs.pop("max_source_positions") - data["input_features"] = self._retreive_input_features(audio, max_source_positions, **audio_kwargs) + data["input_features"] = self._retrieve_input_features(audio, max_source_positions, **audio_kwargs) return BatchFeature(data=data, tensor_type=return_tensors) @@ -421,7 +421,7 @@ def apply_transcription_request( # extract the input features max_source_positions = audio_kwargs.pop("max_source_positions") - data["input_features"] = self._retreive_input_features( + data["input_features"] = self._retrieve_input_features( audio_arrays, max_source_positions, **audio_kwargs ) diff --git a/src/transformers/utils/generic.py b/src/transformers/utils/generic.py index ef5e356bcd1c..451b98193d43 100644 --- a/src/transformers/utils/generic.py +++ b/src/transformers/utils/generic.py @@ -850,7 +850,7 @@ def wrapper(self, *args, **kwargs): } # We let cross attentions to be saved separately because some models add `cross-attn` layer - # when certain condtions are met. Let's output cross attention if attentions are requested (for BC) + # when certain conditions are met. Let's output cross attention if attentions are requested (for BC) if "output_attentions" in recordable_keys: recordable_keys["output_cross_attentions"] = recordable_keys["output_attentions"] diff --git a/src/transformers/utils/metrics.py b/src/transformers/utils/metrics.py index 62b41995a6d9..33623b385ce3 100644 --- a/src/transformers/utils/metrics.py +++ b/src/transformers/utils/metrics.py @@ -339,7 +339,7 @@ def record_kv_cache_memory_metrics(self, cache) -> None: page_size = cache.head_dim * cache.num_key_value_heads page_mem_in_bytes = page_size * cache.dtype.itemsize # When a block is allocated, it is for both K and V, so we multiply by 2 - # It's also allocated accross all cache tensors, so we multiply by the nb of tensors: len(cache.key_cache) + # It's also allocated across all cache tensors, so we multiply by the nb of tensors: len(cache.key_cache) block_mem_in_bytes = 2 * len(cache.key_cache) * cache.block_size * page_mem_in_bytes # Retrieve the number of used and free blocks diff --git a/tests/models/bloom/test_modeling_bloom.py b/tests/models/bloom/test_modeling_bloom.py index 9480aac9ae94..e8cdd43ff5f7 100644 --- a/tests/models/bloom/test_modeling_bloom.py +++ b/tests/models/bloom/test_modeling_bloom.py @@ -446,7 +446,7 @@ def test_batch_generation(self): @slow @require_torch_accelerator - def test_batch_generation_padd(self): + def test_batch_generation_padding(self): path_560m = "bigscience/bloom-560m" model = BloomForCausalLM.from_pretrained(path_560m, use_cache=True, revision="gs555750").to(torch_device) model = model.eval() diff --git a/tests/models/clipseg/test_modeling_clipseg.py b/tests/models/clipseg/test_modeling_clipseg.py index 08a21f9dcf3b..788a60021a88 100644 --- a/tests/models/clipseg/test_modeling_clipseg.py +++ b/tests/models/clipseg/test_modeling_clipseg.py @@ -393,7 +393,7 @@ def create_and_check_model(self, config, input_ids, attention_mask, pixel_values result.logits_per_text.shape, (self.text_model_tester.batch_size, self.vision_model_tester.batch_size) ) - def create_and_check_model_for_image_segmentation(self, config, input_ids, attention_maks, pixel_values): + def create_and_check_model_for_image_segmentation(self, config, input_ids, attention_mask, pixel_values): model = CLIPSegForImageSegmentation(config).to(torch_device).eval() with 
torch.no_grad(): result = model(input_ids, pixel_values) diff --git a/tests/models/codegen/test_modeling_codegen.py b/tests/models/codegen/test_modeling_codegen.py index ee16a5347ad6..5f97cfad359d 100644 --- a/tests/models/codegen/test_modeling_codegen.py +++ b/tests/models/codegen/test_modeling_codegen.py @@ -379,7 +379,7 @@ def test_batch_generation(self): model.config.pad_token_id = model.config.eos_token_id # use different length sentences to test batching - sentences = ["def hellow_world():", "def greet(name):"] + sentences = ["def hello_world():", "def greet(name):"] inputs = tokenizer(sentences, return_tensors="pt", padding=True) input_ids = inputs["input_ids"].to(torch_device) @@ -415,7 +415,7 @@ def test_batch_generation(self): padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True) expected_output_sentence = [ - 'def hellow_world():\n print("Hello World")\n\nhellow_world()', + 'def hello_world():\n print("Hello World")\n\nhellow_world()', 'def greet(name):\n print(f"Hello {name}")\n\ng', ] self.assertListEqual(expected_output_sentence, batch_out_sentence) diff --git a/tests/models/cohere2/test_modeling_cohere2.py b/tests/models/cohere2/test_modeling_cohere2.py index cdb78895e866..4619c7a7f19d 100644 --- a/tests/models/cohere2/test_modeling_cohere2.py +++ b/tests/models/cohere2/test_modeling_cohere2.py @@ -241,7 +241,7 @@ def test_generation_beyond_sliding_window(self, attn_implementation: str): self.skipTest("FlashAttention2 is required for this test.") if torch_device == "xpu" and attn_implementation == "flash_attention_2": - self.skipTest(reason="Intel XPU doesn't support falsh_attention_2 as of now.") + self.skipTest(reason="Intel XPU doesn't support flash_attention_2 as of now.") model_id = "CohereForAI/c4ai-command-r7b-12-2024" EXPECTED_COMPLETIONS = [ diff --git a/tests/models/csm/test_modeling_csm.py b/tests/models/csm/test_modeling_csm.py index 19e0beb39cb9..204ef79831f3 100644 --- a/tests/models/csm/test_modeling_csm.py +++ b/tests/models/csm/test_modeling_csm.py @@ -362,7 +362,7 @@ def _load_conversation(self): def test_1b_model_integration_generate(self): """ Tests the generated tokens match the ones from the original model implementation. - Such tokens are to be retreived using https://gist.github.com/eustlb/d25577a357ddcf8f4a8cd0d00baca551, which is a script that infers the original model. + Such tokens are to be retrieved using https://gist.github.com/eustlb/d25577a357ddcf8f4a8cd0d00baca551, which is a script that infers the original model. """ processor = AutoProcessor.from_pretrained(self.model_checkpoint) prompt = "<|begin_of_text|>[0]What are you working on?<|end_of_text|><|AUDIO|><|audio_eos|><|begin_of_text|>[1]I'm figuring out my budget.<|end_of_text|>" @@ -406,7 +406,7 @@ def test_1b_model_integration_generate(self): def test_1b_model_integration_generate_no_audio(self): """ Tests the generated tokens match the ones from the original model implementation. - Such tokens are to be retreived using https://gist.github.com/eustlb/aed822f765e928b9612e01b0d8836d69, which is a script that infers the original model. + Such tokens are to be retrieved using https://gist.github.com/eustlb/aed822f765e928b9612e01b0d8836d69, which is a script that infers the original model. 
""" processor = AutoProcessor.from_pretrained(self.model_checkpoint) @@ -467,7 +467,7 @@ def test_1b_model_integration_generate_no_audio(self): def test_1b_model_integration_generate_multiple_audio(self): """ Test the generated tokens match the ones from the original model implementation. - Such tokens are to be retreived using https://gist.github.com/eustlb/0c94de002e1325abb61d32217f74c0f8, which is a script that infers the original model. + Such tokens are to be retrieved using https://gist.github.com/eustlb/0c94de002e1325abb61d32217f74c0f8, which is a script that infers the original model. """ processor = AutoProcessor.from_pretrained(self.model_checkpoint) @@ -526,7 +526,7 @@ def test_1b_model_integration_generate_multiple_audio(self): def test_1b_model_integration_generate_batched(self): """ Test the generated tokens match the ones from the original model implementation. - Such tokens are to be retreived using https://gist.github.com/eustlb/bcc532b53161bc31da3d66cb07ae193f, which is a script that infers the original model. + Such tokens are to be retrieved using https://gist.github.com/eustlb/bcc532b53161bc31da3d66cb07ae193f, which is a script that infers the original model. """ processor = AutoProcessor.from_pretrained(self.model_checkpoint) diff --git a/tests/models/deepseek_v3/test_modeling_deepseek_v3.py b/tests/models/deepseek_v3/test_modeling_deepseek_v3.py index 9ed521509408..ed366fb2405e 100644 --- a/tests/models/deepseek_v3/test_modeling_deepseek_v3.py +++ b/tests/models/deepseek_v3/test_modeling_deepseek_v3.py @@ -326,7 +326,9 @@ def test_model_rope_scaling(self): long_input_length = int(config.max_position_embeddings * 1.5) # Inputs - x = torch.randn(1, dtype=torch.float32, device=torch_device) # used exlusively to get the dtype and the device + x = torch.randn( + 1, dtype=torch.float32, device=torch_device + ) # used exclusively to get the dtype and the device position_ids_short = torch.arange(short_input_length, dtype=torch.long, device=torch_device) position_ids_short = position_ids_short.unsqueeze(0) position_ids_long = torch.arange(long_input_length, dtype=torch.long, device=torch_device) diff --git a/tests/models/distilbert/test_modeling_distilbert.py b/tests/models/distilbert/test_modeling_distilbert.py index db90233b438a..a22d229a0405 100644 --- a/tests/models/distilbert/test_modeling_distilbert.py +++ b/tests/models/distilbert/test_modeling_distilbert.py @@ -383,7 +383,7 @@ def test_flash_attn_2_inference_equivalence_right_padding(self): @require_torch -class DistilBertModelIntergrationTest(unittest.TestCase): +class DistilBertModelIntegrationTest(unittest.TestCase): @slow def test_inference_no_head_absolute_embedding(self): model = DistilBertModel.from_pretrained("distilbert-base-uncased") diff --git a/tests/models/evolla/test_modeling_evolla.py b/tests/models/evolla/test_modeling_evolla.py index 50574c7c5096..b518c0db956d 100644 --- a/tests/models/evolla/test_modeling_evolla.py +++ b/tests/models/evolla/test_modeling_evolla.py @@ -257,7 +257,7 @@ def test_generate_multiple_proteins(self): def test_saprot_output(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.return_dict = True - protein_informations = { + protein_information = { "input_ids": inputs_dict["protein_input_ids"], "attention_mask": inputs_dict["protein_attention_mask"], } @@ -267,13 +267,13 @@ def test_saprot_output(self): model = model_class(config) model.to(torch_device) model.eval() - protein_encoder_outputs = 
model.protein_encoder.model(**protein_informations, return_dict=True) + protein_encoder_outputs = model.protein_encoder.model(**protein_information, return_dict=True) print(model_class, protein_encoder_outputs) def test_protein_encoder_output(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.return_dict = True - protein_informations = { + protein_information = { "input_ids": inputs_dict["protein_input_ids"], "attention_mask": inputs_dict["protein_attention_mask"], } @@ -283,7 +283,7 @@ def test_protein_encoder_output(self): model = model_class(config) model.to(torch_device) model.eval() - protein_encoder_outputs = model.protein_encoder(**protein_informations, return_dict=True) + protein_encoder_outputs = model.protein_encoder(**protein_information, return_dict=True) print(model_class, protein_encoder_outputs) def test_single_forward(self): diff --git a/tests/models/gemma3/test_modeling_gemma3.py b/tests/models/gemma3/test_modeling_gemma3.py index ddef6e0d6bc1..95c33187eb7c 100644 --- a/tests/models/gemma3/test_modeling_gemma3.py +++ b/tests/models/gemma3/test_modeling_gemma3.py @@ -814,8 +814,8 @@ def test_dynamic_sliding_window_is_default(self): prompt = "What is the capital of France?" model_inputs = tokenizer(prompt, return_tensors="pt").to(model.device) - foward_outputs = model(**model_inputs) - self.assertIn("DynamicSlidingWindowLayer", str(foward_outputs.past_key_values)) + forward_outputs = model(**model_inputs) + self.assertIn("DynamicSlidingWindowLayer", str(forward_outputs.past_key_values)) generate_outputs = model.generate( **model_inputs, max_new_tokens=2, do_sample=False, return_dict_in_generate=True diff --git a/tests/models/granite_speech/test_modeling_granite_speech.py b/tests/models/granite_speech/test_modeling_granite_speech.py index 1ea1f73b4344..adb925934548 100644 --- a/tests/models/granite_speech/test_modeling_granite_speech.py +++ b/tests/models/granite_speech/test_modeling_granite_speech.py @@ -127,7 +127,7 @@ def __init__( self.audio_token_index = audio_token_index self.tie_word_embeddings = tie_word_embeddings self.initializer_range = initializer_range - self.has_lora_adapater = has_lora_adapter + self.has_lora_adapter = has_lora_adapter self.downsample_rate = downsample_rate self.window_size = window_size self.is_training = is_training @@ -152,7 +152,7 @@ def get_config(self): audio_token_index=self.audio_token_index, tie_word_embeddings=self.tie_word_embeddings, initializer_range=self.initializer_range, - has_lora_adapter=self.has_lora_adapater, + has_lora_adapter=self.has_lora_adapter, ) def prepare_config_and_inputs(self): diff --git a/tests/models/kosmos2/test_processing_kosmos2.py b/tests/models/kosmos2/test_processing_kosmos2.py index d167ad4ebe57..c2c98882ef02 100644 --- a/tests/models/kosmos2/test_processing_kosmos2.py +++ b/tests/models/kosmos2/test_processing_kosmos2.py @@ -97,7 +97,7 @@ def get_image_processor(self, **kwargs): def tearDownClass(cls): shutil.rmtree(cls.tmpdirname, ignore_errors=True) - def test_image_procesor_load_save_reload(self): + def test_image_processor_load_save_reload(self): # make sure load from Hub repo. 
-> save -> reload locally work image_processor = CLIPImageProcessor.from_pretrained("microsoft/kosmos-2-patch14-224") with TemporaryDirectory() as tmp_dir: diff --git a/tests/models/kyutai_speech_to_text/test_modeling_kyutai_speech_to_text.py b/tests/models/kyutai_speech_to_text/test_modeling_kyutai_speech_to_text.py index b7c4537006dd..8325c0f699ed 100644 --- a/tests/models/kyutai_speech_to_text/test_modeling_kyutai_speech_to_text.py +++ b/tests/models/kyutai_speech_to_text/test_modeling_kyutai_speech_to_text.py @@ -717,7 +717,7 @@ def test_generation(self): reproduce test expected outputs using original codebase: https://gist.github.com/eustlb/7a9aa6139d11e0103c6b65bac103da52 DISCLAIMER: we are testing for pretty short inputs. Indeed, reproducing correct expected outputs for longer is not possible - as implementation choices (qkv matrix in one linear for original code vs three for hf) create growing divergence with context lenght, + as implementation choices (qkv matrix in one linear for original code vs three for hf) create growing divergence with context length, ultimately giving different outputs. """ processor = KyutaiSpeechToTextProcessor.from_pretrained(self.model_checkpoint) @@ -747,7 +747,7 @@ def test_generation_batched(self): reproduce test expected outputs using original codebase: https://gist.github.com/eustlb/b58c217c75124d405ec1c13877c7ece8 DISCLAIMER: we are testing for pretty short inputs. Indeed, reproducing correct expected outputs for longer is not possible - as implementation choices (qkv matrix in one linear for original code vs three for hf) create growing divergence with context lenght, + as implementation choices (qkv matrix in one linear for original code vs three for hf) create growing divergence with context length, ultimately giving different outputs. 
""" processor = KyutaiSpeechToTextProcessor.from_pretrained(self.model_checkpoint) diff --git a/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py b/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py index 729a7f4034f7..5ca0499805ef 100644 --- a/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py +++ b/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py @@ -1697,11 +1697,11 @@ def test_added_token_with_space_before(self): words_without_space = tokens_to_add + list(tokenizer_s.added_tokens_encoder.keys()) boxes = [[i, i, i, i] for i in range(len(words_with_space))] - tokens_to_add_formated = [ + tokens_to_add_formatted = [ AddedToken(token, rstrip=True, lstrip=True, single_word=False) for token in tokens_to_add ] - tokenizer_s.add_tokens(tokens_to_add_formated) - tokenizer_f.add_tokens(tokens_to_add_formated) + tokenizer_s.add_tokens(tokens_to_add_formatted) + tokenizer_f.add_tokens(tokens_to_add_formatted) ids_s = tokenizer_s(words_with_space, boxes=boxes).input_ids ids_f = tokenizer_f(words_with_space, boxes=boxes).input_ids diff --git a/tests/models/m2m_100/test_modeling_m2m_100.py b/tests/models/m2m_100/test_modeling_m2m_100.py index 20cd88baa534..32c5edd3071f 100644 --- a/tests/models/m2m_100/test_modeling_m2m_100.py +++ b/tests/models/m2m_100/test_modeling_m2m_100.py @@ -117,7 +117,7 @@ def prepare_config_and_inputs(self): # all pad tokens have pos id = 2 and rest are between 2..seq_length # and the seq_length here is seq_length - num_pad_tokens # but when using past, there is no way of knowing if the past input ids had - # pad tokens in them, which results in incorrect seq_lenth and which in turn results in + # pad tokens in them, which results in incorrect seq_length and which in turn results in # position_ids being off by num_pad_tokens in past input input_ids = input_ids.clamp(self.pad_token_id + 1) decoder_input_ids = decoder_input_ids.clamp(self.pad_token_id + 1) diff --git a/tests/models/mllama/test_processing_mllama.py b/tests/models/mllama/test_processing_mllama.py index be1472496823..e9acdddcd0c3 100644 --- a/tests/models/mllama/test_processing_mllama.py +++ b/tests/models/mllama/test_processing_mllama.py @@ -274,12 +274,14 @@ def test_process_interleaved_images_prompts_image_splitting(self): [self.image_token_id, self.bos_token_id, 2028, 374, 264, 1296, 11914, 13], [self.bos_token_id, 2028, 374, 264, 1296, 11914, 13, self.image_token_id, self.image_token_id, 2028, 374, 264, 1296, 11914, 13], ] - # fmt: onn + # fmt: on images = [[self.image1], [self.image1, self.image2]] inputs = processor(text=text, images=images, padding=True, size={"width": 256, "height": 256}) self.assertEqual(inputs["pixel_values"].shape, (2, 2, 4, 3, 256, 256)) - for input_ids_i, attention_mask_i, expected_ids_i in zip(inputs["input_ids"], inputs["attention_mask"], expected_ids): + for input_ids_i, attention_mask_i, expected_ids_i in zip( + inputs["input_ids"], inputs["attention_mask"], expected_ids + ): pad_ids = [id for id, m in zip(input_ids_i, attention_mask_i) if m == 0] input_ids = [id for id, m in zip(input_ids_i, attention_mask_i) if m == 1] self.assertEqual(input_ids, expected_ids_i) @@ -291,24 +293,38 @@ def test_process_interleaved_images_prompts_image_splitting(self): # Check that only first tile of first sample is attended to all text tokens first_sample_mask = cross_attention_mask[0].copy() first_image_first_tile_attention = first_sample_mask[:, :1, :1] # text tokens, images, tiles - self.assertTrue(np.all(first_image_first_tile_attention == 1), f"Cross attention mask is 
not all ones: {first_image_first_tile_attention}") + self.assertTrue( + np.all(first_image_first_tile_attention == 1), + f"Cross attention mask is not all ones: {first_image_first_tile_attention}", + ) # zero out first tile of first image first_image_first_tile_attention[:, :1, :1] = 0 - self.assertTrue(np.all(first_image_first_tile_attention == 0), f"Cross attention mask is not all zeros: {first_image_first_tile_attention}") + self.assertTrue( + np.all(first_image_first_tile_attention == 0), + f"Cross attention mask is not all zeros: {first_image_first_tile_attention}", + ) # second sample second_sample_mask = cross_attention_mask[1].copy() first_image_first_tile_attention = second_sample_mask[7:, :1, :1] # text tokens, images, tiles - self.assertTrue(np.all(first_image_first_tile_attention == 1), f"Cross attention mask is not all ones: {first_image_first_tile_attention}") + self.assertTrue( + np.all(first_image_first_tile_attention == 1), + f"Cross attention mask is not all ones: {first_image_first_tile_attention}", + ) second_image_two_tiles_attention = second_sample_mask[8:, 1:2, :2] # text tokens, images, tiles - self.assertTrue(np.all(second_image_two_tiles_attention == 1), f"Cross attention mask is not all ones: {second_image_two_tiles_attention}") + self.assertTrue( + np.all(second_image_two_tiles_attention == 1), + f"Cross attention mask is not all ones: {second_image_two_tiles_attention}", + ) # zero out both images masks second_sample_mask[7:, :1, :1] = 0 second_sample_mask[8:, 1:2, :2] = 0 - self.assertTrue(np.all(second_sample_mask == 0), f"Cross attention mask is not all zeros: {second_sample_mask}") + self.assertTrue( + np.all(second_sample_mask == 0), f"Cross attention mask is not all zeros: {second_sample_mask}" + ) def test_process_interleaved_images_prompts_image_error(self): text = [ @@ -406,6 +422,6 @@ def test_special_mm_token_truncation(self): max_length=3, ) - @unittest.skip("Mllama can't process inouts with no image ttogether with multimodal inputs") + @unittest.skip("Mllama can't process inputs with no image ttogether with multimodal inputs") def test_processor_text_has_no_visual(self): pass diff --git a/tests/models/modernbert/test_modeling_modernbert.py b/tests/models/modernbert/test_modeling_modernbert.py index 2a9c63089819..b1f0ce468a38 100644 --- a/tests/models/modernbert/test_modeling_modernbert.py +++ b/tests/models/modernbert/test_modeling_modernbert.py @@ -402,7 +402,7 @@ def test_saved_config_excludes_reference_compile(self): @require_flash_attn @require_torch_gpu @pytest.mark.flash_attn_test - def test_flash_attention_dispatches_by_defaul(self): + def test_flash_attention_dispatches_by_default(self): "ModernBert should dispatch to FA2 by default, not SDPA" config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: diff --git a/tests/models/phi4_multimodal/test_modeling_phi4_multimodal.py b/tests/models/phi4_multimodal/test_modeling_phi4_multimodal.py index 84dbf95301c1..b8e3232dc005 100644 --- a/tests/models/phi4_multimodal/test_modeling_phi4_multimodal.py +++ b/tests/models/phi4_multimodal/test_modeling_phi4_multimodal.py @@ -85,7 +85,7 @@ def __init__( hidden_size=32, num_attention_heads=8, intermediate_size=48, - depthwise_seperable_out_channel=128, + depthwise_separable_out_channel=128, nemo_conv_channels=128, initializer_range=1e-5, ), diff --git a/tests/models/vit_mae/test_modeling_vit_mae.py b/tests/models/vit_mae/test_modeling_vit_mae.py index b28d4711d589..689256de2d0d 100644 --- 
a/tests/models/vit_mae/test_modeling_vit_mae.py +++ b/tests/models/vit_mae/test_modeling_vit_mae.py @@ -328,7 +328,7 @@ def test_initialization(self): for model_class in self.all_model_classes: model = model_class(config=configs_no_init) for name, param in model.named_parameters(): - # This is an excepton in the module, it's initialized with xavier_uniform without using initializer_range + # This is an exception in the module, it's initialized with xavier_uniform without using initializer_range if name.endswith("patch_embeddings.projection.weight"): continue if param.requires_grad: diff --git a/tests/test_configuration_common.py b/tests/test_configuration_common.py index f7836dca6db3..4bf85697c4cc 100644 --- a/tests/test_configuration_common.py +++ b/tests/test_configuration_common.py @@ -160,7 +160,7 @@ def create_and_test_config_from_pretrained_custom_kwargs(self): for composite configs. We should overwrite only the requested keys, keeping all values of the subconfig that are loaded from the checkpoint. """ - # Check only composite configs. We can't know which attributes each type fo config has so check + # Check only composite configs. We can't know which attributes each type of config has so check # only text config because we are sure that all text configs have a `vocab_size` config = self.config_class(**self.inputs_dict) if config.get_text_config() is config or not hasattr(self.parent.model_tester, "get_config"): diff --git a/tests/utils/test_add_new_model_like.py b/tests/utils/test_add_new_model_like.py index dffe71897806..5ba84bab5501 100644 --- a/tests/utils/test_add_new_model_like.py +++ b/tests/utils/test_add_new_model_like.py @@ -481,7 +481,7 @@ def test_phi4_with_all_processors(self): Phi4MultimodalAudioAttention, Phi4MultimodalAudioConformerEncoderLayer, Phi4MultimodalAudioConvModule, - Phi4MultimodalAudioDepthWiseSeperableConv1d, + Phi4MultimodalAudioDepthWiseSeparableConv1d, Phi4MultimodalAudioEmbedding, Phi4MultimodalAudioGluPointWiseConv, Phi4MultimodalAudioMeanVarianceNormLayer, @@ -567,7 +567,7 @@ class MyTest2AudioAttention(Phi4MultimodalAudioAttention): pass - class MyTest2AudioDepthWiseSeperableConv1d(Phi4MultimodalAudioDepthWiseSeperableConv1d): + class MyTest2AudioDepthWiseSeparableConv1d(Phi4MultimodalAudioDepthWiseSeparableConv1d): pass diff --git a/utils/check_copies.py b/utils/check_copies.py index 56530dab8829..2bb00776af98 100644 --- a/utils/check_copies.py +++ b/utils/check_copies.py @@ -504,7 +504,7 @@ def find_code_and_splits(object_name: str, base_path: str, buffer: Optional[dict code (`str`): The object's code. code_splits (`List[Tuple[str, int, int]]`): - `code` splitted into blocks. See `split_code_into_blocks`. + `code` split into blocks. See `split_code_into_blocks`. """ if buffer is None: buffer = {}
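
Note (not part of the patch): the comment touched in src/transformers/utils/metrics.py and the PagedAttentionCache docstring above both describe how block memory is derived from the cache geometry — a page holds one token's keys (or values) across all KV heads, and a block is allocated for both K and V and across every cache tensor in the layer group. A minimal, self-contained Python sketch of that arithmetic follows; the function and parameter names are illustrative stand-ins for the corresponding cache attributes, not identifiers from the library.

def kv_cache_block_bytes(head_dim: int, num_key_value_heads: int,
                         block_size: int, num_layer_tensors: int,
                         dtype_itemsize: int) -> int:
    # One "page" holds the K (or V) states of a single token across all KV heads.
    page_size = head_dim * num_key_value_heads
    page_mem_in_bytes = page_size * dtype_itemsize
    # A block is allocated for both K and V (factor 2) and across all cache
    # tensors of the layer group (factor num_layer_tensors), as the comment in
    # record_kv_cache_memory_metrics states.
    return 2 * num_layer_tensors * block_size * page_mem_in_bytes

# Example: 128-dim heads, 8 KV heads, 32-token blocks, 26 layer tensors, fp16 (2 bytes)
# -> 2 * 26 * 32 * (128 * 8 * 2) = 3_407_872 bytes (~3.25 MiB) per block.
print(kv_cache_block_bytes(128, 8, 32, 26, 2))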