2 changes: 1 addition & 1 deletion examples/legacy/run_chinese_ref.py
@@ -55,7 +55,7 @@ def get_chinese_word(tokens: list[str]):
def add_sub_symbol(bert_tokens: list[str], chinese_word_set: set()):
if not chinese_word_set:
return bert_tokens
- max_word_len = max([len(w) for w in chinese_word_set])
+ max_word_len = max(len(w) for w in chinese_word_set)

bert_word = bert_tokens
start, end = 0, len(bert_word)
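Every hunk in this PR applies the same change: a list comprehension passed to `max()` or `sum()` becomes a generator expression, so the intermediate list is never materialized. A minimal sketch of the before/after behavior (illustrative only, not code from the repository):

```python
words = ["foo", "barbaz", "qux"]

# Before: the list comprehension builds every length up front
longest_before = max([len(w) for w in words])

# After: the generator expression feeds lengths to max() lazily
longest_after = max(len(w) for w in words)

assert longest_before == longest_after == 6
```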
4 changes: 2 additions & 2 deletions examples/pytorch/question-answering/run_qa_no_trainer.py
@@ -950,7 +950,7 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len):
all_start_logits.append(accelerator.gather_for_metrics(start_logits).cpu().numpy())
all_end_logits.append(accelerator.gather_for_metrics(end_logits).cpu().numpy())

- max_len = max([x.shape[1] for x in all_start_logits]) # Get the max_length of the tensor
+ max_len = max(x.shape[1] for x in all_start_logits) # Get the max_length of the tensor

# concatenate the numpy array
start_logits_concat = create_and_fill_np_array(all_start_logits, eval_dataset, max_len)
@@ -989,7 +989,7 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len):
all_start_logits.append(accelerator.gather_for_metrics(start_logits).cpu().numpy())
all_end_logits.append(accelerator.gather_for_metrics(end_logits).cpu().numpy())

- max_len = max([x.shape[1] for x in all_start_logits]) # Get the max_length of the tensor
+ max_len = max(x.shape[1] for x in all_start_logits) # Get the max_length of the tensor
# concatenate the numpy array
start_logits_concat = create_and_fill_np_array(all_start_logits, predict_dataset, max_len)
end_logits_concat = create_and_fill_np_array(all_end_logits, predict_dataset, max_len)
2 changes: 1 addition & 1 deletion src/transformers/generation/stopping_criteria.py
@@ -249,7 +249,7 @@ def __init__(self, tokenizer: PreTrainedTokenizerBase, stop_strings: Union[str,
token_list, token_indices, tokenizer
)

- self.maximum_token_len = max([len(stop_string) for stop_string in self.stop_strings])
+ self.maximum_token_len = max(len(stop_string) for stop_string in self.stop_strings)
self.num_stop_strings = len(self.stop_strings)
self.target_lens = torch.tensor([len(stop_string) for stop_string in stop_strings], dtype=torch.int32)

4 changes: 2 additions & 2 deletions src/transformers/modeling_utils.py
@@ -4103,9 +4103,9 @@ def get_memory_footprint(self, return_buffers=True):
are tensors that do not require gradients and not registered as parameters. E.g. mean and std in batch
norm layers. Please see: https://discuss.pytorch.org/t/what-pytorch-means-by-buffers/120266/2
"""
- mem = sum([param.nelement() * param.element_size() for param in self.parameters()])
+ mem = sum(param.nelement() * param.element_size() for param in self.parameters())
if return_buffers:
- mem_bufs = sum([buf.nelement() * buf.element_size() for buf in self.buffers()])
+ mem_bufs = sum(buf.nelement() * buf.element_size() for buf in self.buffers())
mem = mem + mem_bufs
return mem

2 changes: 1 addition & 1 deletion src/transformers/models/deprecated/mctct/modeling_mctct.py
@@ -96,7 +96,7 @@ def __init__(self, config):
def forward(self, input_features):
# NOTE: in reference to the NOTE in __init__, right now it just calculates padding as if
# there will be just one conv layer.
- padding = sum([size // 2 for size in self.kernel_size]) # (7, 7) -> (3, 3)
+ padding = sum(size // 2 for size in self.kernel_size) # (7, 7) -> (3, 3)

input_features = torch.nn.functional.pad(input_features, (0, 0, padding, padding), "constant", 0)
hidden_states = input_features.transpose(1, 2).contiguous() # -> Batch x Frame x Time
@@ -425,8 +425,8 @@ def _get_new_num_tokens_layer(self, new_num_tokens, layer):

new_num_tokens_layer = (
new_num_tokens
- - sum([emb.weight.shape[0] for emb in embeddings.emb_layers[:layer]])
- - sum([emb.weight.shape[0] for emb in embeddings.emb_layers[layer + 1 :]])
+ - sum(emb.weight.shape[0] for emb in embeddings.emb_layers[:layer])
+ - sum(emb.weight.shape[0] for emb in embeddings.emb_layers[layer + 1 :])
)
return new_num_tokens_layer, layer

@@ -202,7 +202,7 @@ def __call__(

# Create audio attention mask
max_patch_len = max(
- [ceil(feature.shape[0] / self.patch_size[0]) * self.freq_len for feature in audio_features]
+ ceil(feature.shape[0] / self.patch_size[0]) * self.freq_len for feature in audio_features
) # The maximum number of audio patches in a batch
if return_attention_mask:
audio_mask = [
@@ -392,7 +392,7 @@ def preprocess(
f"number of frames must not be greater than the maximum frames of the model {self.num_frames}."
)

- max_num_frames = max([len(video) for video in videos])
+ max_num_frames = max(len(video) for video in videos)
num_patches_per_image = (size["shortest_edge"] // patch_size[0]) ** 2
video_masks = np.array(
[
4 changes: 2 additions & 2 deletions src/transformers/models/emu3/image_processing_emu3.py
@@ -266,8 +266,8 @@ def _pad_for_batching(
"""

max_shape = (
- max([size[0] for size in image_sizes]),
- max([size[1] for size in image_sizes]),
+ max(size[0] for size in image_sizes),
+ max(size[1] for size in image_sizes),
)
pixel_values = [
pad(
2 changes: 1 addition & 1 deletion src/transformers/models/eomt/modeling_eomt.py
@@ -628,7 +628,7 @@ def get_num_masks(self, class_labels: torch.Tensor, device: torch.device) -> tor
"""
Computes the average number of target masks across the batch, for normalization purposes.
"""
- num_masks = sum([len(classes) for classes in class_labels])
+ num_masks = sum(len(classes) for classes in class_labels)
num_masks = torch.as_tensor(num_masks, dtype=torch.float, device=device)
world_size = 1
if is_accelerate_available():
2 changes: 1 addition & 1 deletion src/transformers/models/esm/openfold_utils/protein.py
@@ -159,7 +159,7 @@ def add_pdb_headers(prot: Protein, pdb_str: str) -> str:
parent_dict.setdefault(str(i), [])
parent_dict[str(i)].append(p)

- max_idx = max([int(chain_idx) for chain_idx in parent_dict])
+ max_idx = max(int(chain_idx) for chain_idx in parent_dict)
for i in range(max_idx + 1):
chain_parents = parent_dict.get(str(i), ["N/A"])
parents_per_chain.append(chain_parents)
2 changes: 1 addition & 1 deletion src/transformers/models/idefics/modeling_idefics.py
@@ -997,7 +997,7 @@ def forward(
elif position_ids is None:
position_ids = cache_position.unsqueeze(0)

- if sum([x is None for x in [pixel_values, image_encoder_embeddings, perceiver_embeddings]]) != 2:
+ if sum(x is None for x in [pixel_values, image_encoder_embeddings, perceiver_embeddings]) != 2:
raise ValueError(
"Exactly 1 of pixel_values, image_encoder_embeddings or perceiver_embeddings has to be not-None."
)
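The idefics check above relies on `sum()` counting booleans as integers, so the generator form behaves exactly like the list form. A tiny sketch of that behavior (illustrative values, not from the model code):

```python
pixel_values, image_encoder_embeddings, perceiver_embeddings = None, object(), None

# True == 1 and False == 0, so sum() counts how many of the inputs are None
none_count = sum(x is None for x in [pixel_values, image_encoder_embeddings, perceiver_embeddings])
print(none_count)  # 2 -> exactly one input was provided, as the check requires
```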
@@ -522,7 +522,7 @@ def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
if (
(is_split_into_words or add_prefix_space)
and (len(text) > 0 and not text[0].isspace())
- and sum([text.startswith(no_split_token) for no_split_token in self.added_tokens_encoder]) == 0
+ and sum(text.startswith(no_split_token) for no_split_token in self.added_tokens_encoder) == 0
):
text = " " + text
return (text, kwargs)
@@ -783,7 +783,7 @@ def get_num_masks(self, class_labels: torch.Tensor, device: torch.device) -> tor
"""
Computes the average number of target masks across the batch, for normalization purposes.
"""
- num_masks = sum([len(classes) for classes in class_labels])
+ num_masks = sum(len(classes) for classes in class_labels)
num_masks = torch.as_tensor(num_masks, dtype=torch.float, device=device)
world_size = 1
if is_accelerate_available():
2 changes: 1 addition & 1 deletion src/transformers/models/maskformer/modeling_maskformer.py
@@ -1088,7 +1088,7 @@ def get_num_masks(self, class_labels: torch.Tensor, device: torch.device) -> tor
"""
Computes the average number of target masks across the batch, for normalization purposes.
"""
- num_masks = sum([len(classes) for classes in class_labels])
+ num_masks = sum(len(classes) for classes in class_labels)
num_masks = torch.as_tensor(num_masks, dtype=torch.float, device=device)
world_size = 1
if is_accelerate_available():
8 changes: 4 additions & 4 deletions src/transformers/models/mllama/image_processing_mllama.py
@@ -327,7 +327,7 @@ def build_aspect_ratio_mask(aspect_ratios: list[list[tuple[int, int]]], max_imag
The mask contains 1s for valid tiles and 0s for padding.
"""
batch_size = len(aspect_ratios)
- max_num_images = max([len(row) for row in aspect_ratios])
+ max_num_images = max(len(row) for row in aspect_ratios)

aspect_ratio_mask = np.zeros((batch_size, max_num_images, max_image_tiles), dtype=np.int64)

@@ -374,7 +374,7 @@ def pack_images(

# Determine output shape
batch_size = len(batch_images)
- max_num_images = max([len(images) for images in batch_images])
+ max_num_images = max(len(images) for images in batch_images)
shapes = [image.shape for images in batch_images for image in images]
_, channels, tile_height, tile_width = shapes[0]

@@ -412,7 +412,7 @@ def pack_aspect_ratios(aspect_ratios: list[list[tuple[int, int]]], pad_value: in
The aspect ratios stacked into a numpy array with shape (batch_size, max_num_images, 2).
"""
batch_size = len(aspect_ratios)
- max_num_images = max([len(row) for row in aspect_ratios])
+ max_num_images = max(len(row) for row in aspect_ratios)

aspect_ratios_stacked = np.full((batch_size, max_num_images, 2), pad_value, dtype=np.int64)
for i, row in enumerate(aspect_ratios):
@@ -442,7 +442,7 @@ def convert_aspect_ratios_to_ids(aspect_ratios: list[list[tuple[int, int]]], max
"""

batch_size = len(aspect_ratios)
- max_num_images = max([len(row) for row in aspect_ratios])
+ max_num_images = max(len(row) for row in aspect_ratios)
supported_aspect_ratios = get_all_supported_aspect_ratios(max_image_tiles)

aspect_ratios_ids = np.zeros((batch_size, max_num_images), dtype=np.int64)
2 changes: 1 addition & 1 deletion src/transformers/models/mllama/processing_mllama.py
@@ -117,7 +117,7 @@ def convert_sparse_cross_attention_mask_to_dense(
"""

batch_size = len(cross_attention_token_mask)
- max_num_images = max([len(masks) for masks in cross_attention_token_mask])
+ max_num_images = max(len(masks) for masks in cross_attention_token_mask)

cross_attention_mask = np.zeros(
shape=(batch_size, length, max_num_images, max_num_tiles),
12 changes: 5 additions & 7 deletions src/transformers/models/moshi/modeling_moshi.py
@@ -1702,7 +1702,7 @@ def forward(

if audio_codes is not None:
audio_inputs_embeds = sum(
- [self.embed_tokens[codebook](audio_codes[:, codebook]) for codebook in range(audio_codes.shape[1])]
+ self.embed_tokens[codebook](audio_codes[:, codebook]) for codebook in range(audio_codes.shape[1])
)
inputs_embeds = (
audio_inputs_embeds
@@ -1872,20 +1872,18 @@ def _prepare_inputs_embeds_for_generation(
if user_audio_codes is not None and moshi_audio_codes is not None:
audio_codes = torch.cat([moshi_audio_codes, user_audio_codes], dim=1)
audio_inputs_embeds = sum(
- [self.embed_tokens[codebook](audio_codes[:, codebook]) for codebook in range(audio_codes.shape[1])]
+ self.embed_tokens[codebook](audio_codes[:, codebook]) for codebook in range(audio_codes.shape[1])
)
elif moshi_audio_codes is not None:
audio_codes = moshi_audio_codes
audio_inputs_embeds = sum(
- [self.embed_tokens[codebook](audio_codes[:, codebook]) for codebook in range(audio_codes.shape[1])]
+ self.embed_tokens[codebook](audio_codes[:, codebook]) for codebook in range(audio_codes.shape[1])
)
elif user_audio_codes is not None:
audio_codes = user_audio_codes
audio_inputs_embeds = sum(
- [
- self.embed_tokens[codebook](audio_codes[:, codebook + self.num_codebooks])
- for codebook in range(audio_codes.shape[1])
- ]
+ self.embed_tokens[codebook](audio_codes[:, codebook + self.num_codebooks])
+ for codebook in range(audio_codes.shape[1])
)

if input_ids is not None:
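The moshi hunks above (and the decoder hunk that follows) apply `sum()` to a generator of embedding tensors. This still works without a list because `sum()` starts from the integer 0 and `0 + tensor` is a valid tensor operation. A small sketch under that assumption (shapes and table sizes are made up for illustration):

```python
import torch

embed_tokens = [torch.nn.Embedding(10, 4) for _ in range(3)]  # one table per codebook
audio_codes = torch.randint(0, 10, (2, 3, 5))                  # (batch, codebooks, seq_len)

# Generator of per-codebook embeddings; sum() reduces them without an intermediate list
inputs_embeds = sum(
    embed_tokens[codebook](audio_codes[:, codebook]) for codebook in range(audio_codes.shape[1])
)
print(inputs_embeds.shape)  # torch.Size([2, 5, 4])
```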
@@ -523,7 +523,7 @@ def forward(

past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
if inputs_embeds is None:
- inputs_embeds = sum([self.embed_tokens[codebook](input[:, codebook]) for codebook in range(num_codebooks)])
+ inputs_embeds = sum(self.embed_tokens[codebook](input[:, codebook]) for codebook in range(num_codebooks))

if encoder_hidden_states is not None:
# take care of attention masks
@@ -85,7 +85,7 @@ def shard_on_the_fly(switch_checkpoint_path, dump_path, num_experts, dtype, weig
)
torch.save(expert_state, save_path)
sharded_state_dicts.append(expert_state.keys())
- total_size += sum([value.numel() for key, value in expert_state.items()]) * (
+ total_size += sum(value.numel() for key, value in expert_state.items()) * (
expert_state[list(expert_state)[0]].element_size()
)

4 changes: 2 additions & 2 deletions src/transformers/models/omdet_turbo/modeling_omdet_turbo.py
@@ -352,7 +352,7 @@ def forward(
batch_size, num_queries, _ = hidden_states.shape
batch_size, sequence_length, _ = encoder_hidden_states.shape
# Ignore copy
- total_elements = sum([shape[0] * shape[1] for shape in spatial_shapes_list])
+ total_elements = sum(shape[0] * shape[1] for shape in spatial_shapes_list)
if total_elements != sequence_length:
raise ValueError(
"Make sure to align the spatial shapes with the sequence length of the encoder hidden states"
@@ -1086,7 +1086,7 @@ def get_cached_task_embeddings(self, tasks_input_ids, tasks_attention_mask):
self.language_cache_prompt.put(not_cached_tasks[idx], (emb, cur_mask))

# pad before concat if needed
- max_len = max([task.shape[0] for task in total_task_features])
+ max_len = max(task.shape[0] for task in total_task_features)
for idx, task in enumerate(total_task_features):
if task.shape[0] < max_len:
pad_size = max_len - task.shape[0]
2 changes: 1 addition & 1 deletion src/transformers/models/owlv2/processing_owlv2.py
@@ -139,7 +139,7 @@ def __call__(
encodings = []

# Maximum number of queries across batch
- max_num_queries = max([len(text_single) for text_single in text])
+ max_num_queries = max(len(text_single) for text_single in text)

# Pad all batch samples to max number of text queries
for text_single in text:
2 changes: 1 addition & 1 deletion src/transformers/models/owlvit/processing_owlvit.py
@@ -149,7 +149,7 @@ def __call__(
encodings = []

# Maximum number of queries across batch
- max_num_queries = max([len(text_single) for text_single in text])
+ max_num_queries = max(len(text_single) for text_single in text)

# Pad all batch samples to max number of text queries
for text_single in text:
4 changes: 2 additions & 2 deletions src/transformers/models/pixtral/image_processing_pixtral.py
@@ -302,8 +302,8 @@ def _pad_for_batching(
"""

max_shape = (
- max([size[0] for size in image_sizes]),
- max([size[1] for size in image_sizes]),
+ max(size[0] for size in image_sizes),
+ max(size[1] for size in image_sizes),
)
pixel_values = [
pad(
@@ -126,7 +126,7 @@ def _pad_for_batching(
list[`torch.Tensor`]: The padded images.
"""

- max_shape = (max([size[0] for size in image_sizes]), max([size[1] for size in image_sizes]))
+ max_shape = (max(size[0] for size in image_sizes), max(size[1] for size in image_sizes))
pixel_values = [
torch.nn.functional.pad(image, pad=(0, max_shape[1] - size[1], 0, max_shape[0] - size[0]))
for image, size in zip(pixel_values, image_sizes)
4 changes: 1 addition & 3 deletions src/transformers/models/rag/modeling_rag.py
@@ -1080,9 +1080,7 @@ def _mask_pads(ll, smooth_obj):

@staticmethod
def _cat_and_pad(tensors, pad_token_id):
- output = (
- tensors[0].new(sum([t.shape[0] for t in tensors]), max([t.shape[1] for t in tensors])).fill_(pad_token_id)
- )
+ output = tensors[0].new(sum(t.shape[0] for t in tensors), max(t.shape[1] for t in tensors)).fill_(pad_token_id)
ind = 0
for t in tensors:
output[ind : ind + t.shape[0], : t.shape[1]] = t
@@ -190,7 +190,7 @@ def _pad_points_and_labels(self, input_points, input_labels, point_pad_value):
Expand Up @@ -190,7 +190,7 @@ def _pad_points_and_labels(self, input_points, input_labels, point_pad_value):
r"""
The method pads the 2D points and labels to the maximum number of points in the batch.
"""
- expected_nb_points = max([point.shape[0] for point in input_points])
+ expected_nb_points = max(point.shape[0] for point in input_points)
processed_input_points = []
for i, point in enumerate(input_points):
if point.shape[0] != expected_nb_points:
2 changes: 1 addition & 1 deletion src/transformers/models/smolvlm/processing_smolvlm.py
@@ -316,7 +316,7 @@ def __call__(
text = [text]
elif not isinstance(text, list) and not isinstance(text[0], str):
raise ValueError("Invalid input text. Please provide a string, or a list of strings")
- n_images_in_text = sum([sample.count(self.image_token) for sample in text])
+ n_images_in_text = sum(sample.count(self.image_token) for sample in text)
if n_images_in_text > 0 and (images is None and videos is None):
raise ValueError(f"We detected {n_images_in_text} tokens in the text but no images/videos were passed")

@@ -129,7 +129,7 @@ def transform_state_encoder_block(state_dict, i):


def get_n_layers(state_dict):
- return sum([1 if "encoderblock_" in k else 0 for k in state_dict["optimizer"]["target"]["Transformer"]])
+ return sum(1 if "encoderblock_" in k else 0 for k in state_dict["optimizer"]["target"]["Transformer"])


def transform_state(state_dict, classification_head=False):
2 changes: 1 addition & 1 deletion src/transformers/trainer.py
@@ -5233,7 +5233,7 @@ def _get_num_items_in_batch(self, batch_samples: list, device: torch.device) ->
if count_num_items_in_batch:
# For now we don't support object detection
try:
- num_items_in_batch = sum([(batch["labels"].ne(-100)).sum() for batch in batch_samples])
+ num_items_in_batch = sum((batch["labels"].ne(-100)).sum() for batch in batch_samples)
except (TypeError, AttributeError):
pass

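The trainer change above counts label tokens that are not the `-100` ignore index; the generator form yields the same 0-dim tensor result as the list form. A small sketch with made-up batches (not taken from the trainer):

```python
import torch

batch_samples = [
    {"labels": torch.tensor([[-100, 5, 7], [2, -100, -100]])},
    {"labels": torch.tensor([[1, 1, -100]])},
]

# Count label positions that are not the ignore index (-100) across all batches;
# sum() starts at 0, so adding the per-batch 0-dim tensors works without a list.
num_items_in_batch = sum((batch["labels"].ne(-100)).sum() for batch in batch_samples)
print(int(num_items_in_batch))  # 5
```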
2 changes: 1 addition & 1 deletion tests/trainer/test_trainer_seq2seq.py
@@ -77,7 +77,7 @@ def _compute_metrics(pred):
pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

- accuracy = sum([int(pred_str[i] == label_str[i]) for i in range(len(pred_str))]) / len(pred_str)
+ accuracy = sum(int(pred_str[i] == label_str[i]) for i in range(len(pred_str))) / len(pred_str)

return {"accuracy": accuracy}
