diff --git a/src/transformers/models/llava/modeling_llava.py b/src/transformers/models/llava/modeling_llava.py
index cd015f83ae97..09c63e39b3df 100644
--- a/src/transformers/models/llava/modeling_llava.py
+++ b/src/transformers/models/llava/modeling_llava.py
@@ -240,6 +240,7 @@ def __init__(self, config: LlavaConfig):
         self.vocab_size = config.text_config.vocab_size
         self.language_model = AutoModelForCausalLM.from_config(config.text_config)
         self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
+        self.post_init()

     def get_input_embeddings(self):
diff --git a/src/transformers/models/llava/processing_llava.py b/src/transformers/models/llava/processing_llava.py
index 098b6fb379b6..630ccdce1434 100644
--- a/src/transformers/models/llava/processing_llava.py
+++ b/src/transformers/models/llava/processing_llava.py
@@ -157,7 +157,9 @@ def __call__(
                 # Replace the image token with the expanded image token sequence
                 pixel_values = image_inputs["pixel_values"]
                 height, width = get_image_size(to_numpy_array(pixel_values[0]))
-                num_image_tokens = (height // self.patch_size) * (width // self.patch_size) + 1
+                num_image_tokens = (height // self.patch_size) * (
+                    width // self.patch_size
+                ) + self.num_additional_image_tokens
                 if self.vision_feature_select_strategy == "default":
                     num_image_tokens -= 1
diff --git a/src/transformers/models/llava_next/processing_llava_next.py b/src/transformers/models/llava_next/processing_llava_next.py
index cc293a416b38..d8034ca9fa56 100644
--- a/src/transformers/models/llava_next/processing_llava_next.py
+++ b/src/transformers/models/llava_next/processing_llava_next.py
@@ -155,6 +155,9 @@ def __call__(
             for sample in text:
                 while self.image_token in sample:
                     image_size = next(image_sizes)
+                    if not isinstance(image_size, (list, tuple)):
+                        # cast to list to avoid numerical precision errors when calculating unpadding
+                        image_size = image_size.tolist()
                     orig_height, orig_width = image_size
                     num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width)
                     if self.vision_feature_select_strategy == "default":
diff --git a/src/transformers/models/llava_next_video/processing_llava_next_video.py b/src/transformers/models/llava_next_video/processing_llava_next_video.py
index f3b2b78f7aa6..e9307ee37c9e 100644
--- a/src/transformers/models/llava_next_video/processing_llava_next_video.py
+++ b/src/transformers/models/llava_next_video/processing_llava_next_video.py
@@ -180,6 +180,9 @@ def __call__(
             for sample in text:
                 while self.image_token in sample:
                     image_size = next(image_sizes)
+                    if not isinstance(image_size, (list, tuple)):
+                        # cast to list to avoid numerical precision errors when calculating unpadding
+                        image_size = image_size.tolist()
                     orig_height, orig_width = image_size
                     num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width)
                     if self.vision_feature_select_strategy == "default":
@@ -193,6 +196,8 @@ def __call__(
             one_video = to_numpy_array(videos_inputs.get("pixel_values_videos")[0])
             height, width = get_image_size(one_video[0])
             num_frames = one_video.shape[0]  # frame dim is always after batch dim
+
+            # no `self.num_additional_image_tokens` added because video always has a default feature selection strategy
            num_image_tokens = (height // self.patch_size) * (width // self.patch_size)
             num_video_tokens = num_image_tokens // 4 * num_frames  # divide by 4 needed for avg pooling layer
             prompt_strings = []
diff --git a/src/transformers/models/video_llava/processing_video_llava.py b/src/transformers/models/video_llava/processing_video_llava.py
index 3f58675d047a..9d11f4268518 100644
--- a/src/transformers/models/video_llava/processing_video_llava.py
+++ b/src/transformers/models/video_llava/processing_video_llava.py
@@ -179,7 +179,7 @@ def __call__(
                 ) + self.num_additional_image_tokens
                 num_video_tokens = num_image_tokens * num_frames
                 if self.vision_feature_select_strategy == "default":
-                    num_image_tokens -= self.num_additional_image_tokens
+                    num_image_tokens -= 1

             prompt_strings = []
             for sample in text:
diff --git a/src/transformers/models/vipllava/modeling_vipllava.py b/src/transformers/models/vipllava/modeling_vipllava.py
index 0eb65b0fc722..084b92c9771c 100644
--- a/src/transformers/models/vipllava/modeling_vipllava.py
+++ b/src/transformers/models/vipllava/modeling_vipllava.py
@@ -243,6 +243,7 @@ def __init__(self, config: VipLlavaConfig):
         self.vocab_size = config.text_config.vocab_size
         self.language_model = AutoModelForCausalLM.from_config(config.text_config)
         self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
+        self.post_init()

     def get_input_embeddings(self):
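For context, the processor changes above generalize the hard-coded `+ 1` (the CLS token) into `self.num_additional_image_tokens`, while the "default" feature-selection strategy still drops exactly one token (the off-by-N that the video_llava hunk corrects). A minimal sketch of that arithmetic, separate from the library code: the helper name `count_image_tokens` and the example values (a 336x336 image, patch size 14, one extra CLS token) are illustrative assumptions, not part of this patch.

# Illustrative sketch only, not the transformers implementation.
def count_image_tokens(height, width, patch_size, num_additional_image_tokens, strategy):
    # patch grid plus whatever extra tokens the vision tower emits (e.g. CLS)
    num_image_tokens = (height // patch_size) * (width // patch_size) + num_additional_image_tokens
    if strategy == "default":
        # "default" drops the CLS token: remove exactly one token,
        # not num_additional_image_tokens
        num_image_tokens -= 1
    return num_image_tokens

assert count_image_tokens(336, 336, 14, 1, "default") == 576  # 24 * 24
assert count_image_tokens(336, 336, 14, 1, "full") == 577     # 24 * 24 + 1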