diff --git a/src/transformers/models/llava_next/processing_llava_next.py b/src/transformers/models/llava_next/processing_llava_next.py
index ce11be6d6309..34457ba441d9 100644
--- a/src/transformers/models/llava_next/processing_llava_next.py
+++ b/src/transformers/models/llava_next/processing_llava_next.py
@@ -152,7 +152,10 @@ def __call__(
             for sample in text:
                 while self.image_token in sample:
                     image_size = next(image_sizes)
-                    orig_height, orig_width = image_size
+                    if not isinstance(image_size, (list, tuple)):
+                        # cast to list to avoid numerical precision errors when calculating unpadding
+                        image_size = image_size.tolist()
+                    orig_height, orig_width = image_size
                     num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width)
                     if self.vision_feature_select_strategy == "default":
                         num_image_tokens -= 1
diff --git a/src/transformers/models/llava_next_video/processing_llava_next_video.py b/src/transformers/models/llava_next_video/processing_llava_next_video.py
index e0e4534e42b5..9a09f85b6cd5 100644
--- a/src/transformers/models/llava_next_video/processing_llava_next_video.py
+++ b/src/transformers/models/llava_next_video/processing_llava_next_video.py
@@ -177,7 +177,10 @@ def __call__(
             for sample in text:
                 while self.image_token in sample:
                     image_size = next(image_sizes)
-                    orig_height, orig_width = image_size
+                    if not isinstance(image_size, (list, tuple)):
+                        # cast to list to avoid numerical precision errors when calculating unpadding
+                        image_size = image_size.tolist()
+                    orig_height, orig_width = image_size
                     num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width)
                     if self.vision_feature_select_strategy == "default":
                         num_image_tokens -= 1
diff --git a/src/transformers/models/llava_onevision/processing_llava_onevision.py b/src/transformers/models/llava_onevision/processing_llava_onevision.py
index 039e05a7ec19..f73951b84309 100644
--- a/src/transformers/models/llava_onevision/processing_llava_onevision.py
+++ b/src/transformers/models/llava_onevision/processing_llava_onevision.py
@@ -188,7 +188,11 @@ def _expand_image_tokens(
         for sample in text:
             while special_token in sample:
                 image_size_list = next(image_sizes)
-                orig_height, orig_width = image_size_list[0] if num_frames != 1 else image_size_list
+                original_size = image_size_list[0] if num_frames != 1 else image_size_list
+                if not isinstance(original_size, (list, tuple)):
+                    # cast to list to avoid numerical precision errors when calculating unpadding
+                    original_size = original_size.tolist()
+                orig_height, orig_width = original_size
                 num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width)
                 if self.vision_feature_select_strategy == "default":
                     num_image_tokens -= 1