From 3f4dbcd1b741b6a4380df9558d9de1a729f5acde Mon Sep 17 00:00:00 2001 From: raushan Date: Mon, 18 Nov 2024 12:16:08 +0100 Subject: [PATCH 1/3] fix --- .../models/llava_onevision/processing_llava_onevision.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/llava_onevision/processing_llava_onevision.py b/src/transformers/models/llava_onevision/processing_llava_onevision.py index 039e05a7ec19..f22add515c8a 100644 --- a/src/transformers/models/llava_onevision/processing_llava_onevision.py +++ b/src/transformers/models/llava_onevision/processing_llava_onevision.py @@ -188,7 +188,10 @@ def _expand_image_tokens( for sample in text: while special_token in sample: image_size_list = next(image_sizes) - orig_height, orig_width = image_size_list[0] if num_frames != 1 else image_size_list + original_size = image_size_list[0] if num_frames != 1 else image_size_list + orig_height, orig_width = ( + original_size.tolist() + ) # cast to list to avoid numerical precision errors when calculating unpadding num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width) if self.vision_feature_select_strategy == "default": num_image_tokens -= 1 From f5313efb12b7cf0e8f333a7a83c96a5d6eaf4ac8 Mon Sep 17 00:00:00 2001 From: raushan Date: Mon, 18 Nov 2024 12:19:23 +0100 Subject: [PATCH 2/3] propagate --- src/transformers/models/llava_next/processing_llava_next.py | 3 ++- .../models/llava_next_video/processing_llava_next_video.py | 3 ++- .../models/llava_onevision/processing_llava_onevision.py | 5 ++--- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/llava_next/processing_llava_next.py b/src/transformers/models/llava_next/processing_llava_next.py index ce11be6d6309..a296e59e4901 100644 --- a/src/transformers/models/llava_next/processing_llava_next.py +++ b/src/transformers/models/llava_next/processing_llava_next.py @@ -152,7 +152,8 @@ def __call__( for sample in text: while 
self.image_token in sample: image_size = next(image_sizes) - orig_height, orig_width = image_size + # cast to list to avoid numerical precision errors when calculating unpadding + orig_height, orig_width = image_size.tolist() num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width) if self.vision_feature_select_strategy == "default": num_image_tokens -= 1 diff --git a/src/transformers/models/llava_next_video/processing_llava_next_video.py b/src/transformers/models/llava_next_video/processing_llava_next_video.py index e0e4534e42b5..b12069182ec9 100644 --- a/src/transformers/models/llava_next_video/processing_llava_next_video.py +++ b/src/transformers/models/llava_next_video/processing_llava_next_video.py @@ -177,7 +177,8 @@ def __call__( for sample in text: while self.image_token in sample: image_size = next(image_sizes) - orig_height, orig_width = image_size + # cast to list to avoid numerical precision errors when calculating unpadding + orig_height, orig_width = image_size.tolist() num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width) if self.vision_feature_select_strategy == "default": num_image_tokens -= 1 diff --git a/src/transformers/models/llava_onevision/processing_llava_onevision.py b/src/transformers/models/llava_onevision/processing_llava_onevision.py index f22add515c8a..3ad5f104e742 100644 --- a/src/transformers/models/llava_onevision/processing_llava_onevision.py +++ b/src/transformers/models/llava_onevision/processing_llava_onevision.py @@ -189,9 +189,8 @@ def _expand_image_tokens( while special_token in sample: image_size_list = next(image_sizes) original_size = image_size_list[0] if num_frames != 1 else image_size_list - orig_height, orig_width = ( - original_size.tolist() - ) # cast to list to avoid numerical precision errors when calculating unpadding + # cast to list to avoid numerical precision errors when calculating unpadding + orig_height, orig_width = original_size.tolist() 
num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width) if self.vision_feature_select_strategy == "default": num_image_tokens -= 1 From 496e56c850fce51637c2276071729fec1a21c31a Mon Sep 17 00:00:00 2001 From: raushan Date: Mon, 18 Nov 2024 14:33:25 +0100 Subject: [PATCH 3/3] type check --- src/transformers/models/llava_next/processing_llava_next.py | 5 +++-- .../models/llava_next_video/processing_llava_next_video.py | 5 +++-- .../models/llava_onevision/processing_llava_onevision.py | 5 +++-- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/llava_next/processing_llava_next.py b/src/transformers/models/llava_next/processing_llava_next.py index a296e59e4901..34457ba441d9 100644 --- a/src/transformers/models/llava_next/processing_llava_next.py +++ b/src/transformers/models/llava_next/processing_llava_next.py @@ -152,8 +152,9 @@ def __call__( for sample in text: while self.image_token in sample: image_size = next(image_sizes) - # cast to list to avoid numerical precision errors when calculating unpadding - orig_height, orig_width = image_size.tolist() + if not isinstance(image_size, (list, tuple)): + image_size = image_size.tolist()  # cast to list to avoid numerical precision errors when calculating unpadding + orig_height, orig_width = image_size num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width) if self.vision_feature_select_strategy == "default": num_image_tokens -= 1 diff --git a/src/transformers/models/llava_next_video/processing_llava_next_video.py b/src/transformers/models/llava_next_video/processing_llava_next_video.py index b12069182ec9..9a09f85b6cd5 100644 --- a/src/transformers/models/llava_next_video/processing_llava_next_video.py +++ b/src/transformers/models/llava_next_video/processing_llava_next_video.py @@ -177,8 +177,9 @@ def __call__( for sample in text: while self.image_token in sample: image_size = next(image_sizes) - # cast to list to avoid numerical precision errors 
when calculating unpadding - orig_height, orig_width = image_size.tolist() + if not isinstance(image_size, (list, tuple)): + image_size = image_size.tolist()  # cast to list to avoid numerical precision errors when calculating unpadding + orig_height, orig_width = image_size num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width) if self.vision_feature_select_strategy == "default": num_image_tokens -= 1 diff --git a/src/transformers/models/llava_onevision/processing_llava_onevision.py b/src/transformers/models/llava_onevision/processing_llava_onevision.py index 3ad5f104e742..f73951b84309 100644 --- a/src/transformers/models/llava_onevision/processing_llava_onevision.py +++ b/src/transformers/models/llava_onevision/processing_llava_onevision.py @@ -189,8 +189,9 @@ def _expand_image_tokens( while special_token in sample: image_size_list = next(image_sizes) original_size = image_size_list[0] if num_frames != 1 else image_size_list - # cast to list to avoid numerical precision errors when calculating unpadding - orig_height, orig_width = original_size.tolist() + if not isinstance(original_size, (list, tuple)): + original_size = original_size.tolist()  # cast to list to avoid numerical precision errors when calculating unpadding + orig_height, orig_width = original_size num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width) if self.vision_feature_select_strategy == "default": num_image_tokens -= 1