diff --git a/src/transformers/models/idefics3/processing_idefics3.py b/src/transformers/models/idefics3/processing_idefics3.py index 451af1d8a38f..143805ef6f43 100644 --- a/src/transformers/models/idefics3/processing_idefics3.py +++ b/src/transformers/models/idefics3/processing_idefics3.py @@ -286,8 +286,8 @@ def __call__( f"The number of images in the text {n_images_in_text} and images {n_images_in_images} should be the same." ) - image_rows = inputs.pop("rows", [[0] * len(text)]) - image_cols = inputs.pop("cols", [[0] * len(text)]) + image_rows = inputs.pop("rows", [[0] * n_images for n_images in n_images_in_text]) + image_cols = inputs.pop("cols", [[0] * n_images for n_images in n_images_in_text]) fake_image_token = self.fake_image_token image_token = self.image_token diff --git a/src/transformers/models/smolvlm/processing_smolvlm.py b/src/transformers/models/smolvlm/processing_smolvlm.py index 86d07e238f1b..a621af06e647 100644 --- a/src/transformers/models/smolvlm/processing_smolvlm.py +++ b/src/transformers/models/smolvlm/processing_smolvlm.py @@ -172,8 +172,6 @@ def __init__( def expand_text_with_image_tokens(self, text, image_rows, image_cols): prompt_strings = [] - image_rows = image_rows if image_rows is not None else [[0] * len(text)] - image_cols = image_cols if image_cols is not None else [[0] * len(text)] for sample, sample_rows, sample_cols in zip(text, image_rows, image_cols): # Replace the image token with fake tokens around the expanded image token sequence of length `image_seq_len` image_prompt_strings = [] @@ -330,6 +328,11 @@ def __call__( raise ValueError( f"The number of images in the text {n_images_in_text} and images {n_images_in_images} should be the same." ) + # Set default values for image_rows and image_cols if not provided + if image_rows is None: + image_rows = [[0] * n_images for n_images in n_images_in_text] + if image_cols is None: + image_cols = [[0] * n_images for n_images in n_images_in_text] text = self.expand_text_with_image_tokens(text, image_rows=image_rows, image_cols=image_cols) elif videos is not None: