diff --git a/src/transformers/models/aria/modeling_aria.py b/src/transformers/models/aria/modeling_aria.py
index ebddfcfe7115..55b0ed79d707 100644
--- a/src/transformers/models/aria/modeling_aria.py
+++ b/src/transformers/models/aria/modeling_aria.py
@@ -1225,6 +1225,18 @@ def set_decoder(self, decoder):
     def get_decoder(self):
         return self.model
 
+    def get_image_features(
+        self,
+        pixel_values: torch.FloatTensor,
+        pixel_mask: Optional[torch.FloatTensor] = None,
+        vision_feature_layer: int = -1,
+    ):
+        return self.model.get_image_features(
+            pixel_values=pixel_values,
+            pixel_mask=pixel_mask,
+            vision_feature_layer=vision_feature_layer,
+        )
+
     # Make modules available throught conditional class for BC
     @property
     def language_model(self):
diff --git a/src/transformers/models/aria/modular_aria.py b/src/transformers/models/aria/modular_aria.py
index c80351cd9a81..b8ae65d1d516 100644
--- a/src/transformers/models/aria/modular_aria.py
+++ b/src/transformers/models/aria/modular_aria.py
@@ -1497,6 +1497,18 @@ def forward(
     """
 )
 class AriaForConditionalGeneration(LlavaForConditionalGeneration):
+    def get_image_features(
+        self,
+        pixel_values: torch.FloatTensor,
+        pixel_mask: Optional[torch.FloatTensor] = None,
+        vision_feature_layer: int = -1,
+    ):
+        return self.model.get_image_features(
+            pixel_values=pixel_values,
+            pixel_mask=pixel_mask,
+            vision_feature_layer=vision_feature_layer,
+        )
+
     @can_return_tuple
     @auto_docstring
     def forward(
diff --git a/src/transformers/models/aya_vision/modeling_aya_vision.py b/src/transformers/models/aya_vision/modeling_aya_vision.py
index 1d723f1b5aee..f3b9d77b2ce5 100644
--- a/src/transformers/models/aya_vision/modeling_aya_vision.py
+++ b/src/transformers/models/aya_vision/modeling_aya_vision.py
@@ -394,6 +394,20 @@ def set_decoder(self, decoder):
     def get_decoder(self):
         return self.model
 
+    def get_image_features(
+        self,
+        pixel_values: torch.FloatTensor,
+        vision_feature_layer: Optional[Union[int, List[int]]] = None,
+        vision_feature_select_strategy: Optional[str] = None,
+        **kwargs,
+    ):
+        return self.model.get_image_features(
+            pixel_values=pixel_values,
+            vision_feature_layer=vision_feature_layer,
+            vision_feature_select_strategy=vision_feature_select_strategy,
+            **kwargs,
+        )
+
     # Make modules available throught conditional class for BC
     @property
     def language_model(self):
diff --git a/src/transformers/models/chameleon/modeling_chameleon.py b/src/transformers/models/chameleon/modeling_chameleon.py
index 5644f68baa92..63ec521882c8 100644
--- a/src/transformers/models/chameleon/modeling_chameleon.py
+++ b/src/transformers/models/chameleon/modeling_chameleon.py
@@ -1229,6 +1229,12 @@ def set_decoder(self, decoder):
     def get_decoder(self):
         return self.model
 
+    def get_image_tokens(self, pixel_values):
+        return self.model.get_image_tokens(pixel_values)
+
+    def get_image_features(self, pixel_values):
+        return self.model.get_image_features(pixel_values)
+
     @can_return_tuple
     @auto_docstring
     def forward(
diff --git a/src/transformers/models/gemma3/modeling_gemma3.py b/src/transformers/models/gemma3/modeling_gemma3.py
index 08740173009d..dd8587e9d114 100644
--- a/src/transformers/models/gemma3/modeling_gemma3.py
+++ b/src/transformers/models/gemma3/modeling_gemma3.py
@@ -1019,6 +1019,9 @@ def set_decoder(self, decoder):
     def get_decoder(self):
         return self.model
 
+    def get_image_features(self, pixel_values):
+        return self.model.get_image_features(pixel_values)
+
     # Make modules available throught conditional class for BC
     @property
     def language_model(self):
diff --git a/src/transformers/models/got_ocr2/modeling_got_ocr2.py b/src/transformers/models/got_ocr2/modeling_got_ocr2.py
index fc3ab807df82..044013bed45e 100644
--- a/src/transformers/models/got_ocr2/modeling_got_ocr2.py
+++ b/src/transformers/models/got_ocr2/modeling_got_ocr2.py
@@ -762,6 +762,20 @@ def set_decoder(self, decoder):
     def get_decoder(self):
         return self.model
 
+    def get_image_features(
+        self,
+        pixel_values: torch.FloatTensor,
+        vision_feature_layer: Optional[Union[int, List[int]]] = None,
+        vision_feature_select_strategy: Optional[str] = None,
+        **kwargs,
+    ):
+        return self.model.get_image_features(
+            pixel_values=pixel_values,
+            vision_feature_layer=vision_feature_layer,
+            vision_feature_select_strategy=vision_feature_select_strategy,
+            **kwargs,
+        )
+
     # Make modules available throught conditional class for BC
     @property
     def language_model(self):
diff --git a/src/transformers/models/idefics2/modeling_idefics2.py b/src/transformers/models/idefics2/modeling_idefics2.py
index f7008ad33e83..4607051b195a 100644
--- a/src/transformers/models/idefics2/modeling_idefics2.py
+++ b/src/transformers/models/idefics2/modeling_idefics2.py
@@ -1186,6 +1186,9 @@ def get_output_embeddings(self):
     def set_output_embeddings(self, new_embeddings):
         self.lm_head = new_embeddings
 
+    def get_image_features(self, pixel_values: torch.FloatTensor, pixel_attention_mask: torch.LongTensor = None):
+        return self.model.get_image_features(pixel_values=pixel_values, pixel_attention_mask=pixel_attention_mask)
+
     @can_return_tuple
     @auto_docstring
     def forward(
diff --git a/src/transformers/models/idefics3/modeling_idefics3.py b/src/transformers/models/idefics3/modeling_idefics3.py
index 41043b3e7455..99ea7fd0d3d0 100644
--- a/src/transformers/models/idefics3/modeling_idefics3.py
+++ b/src/transformers/models/idefics3/modeling_idefics3.py
@@ -909,6 +909,9 @@ def get_output_embeddings(self):
     def set_output_embeddings(self, new_embeddings):
         self.lm_head = new_embeddings
 
+    def get_image_features(self, pixel_values: torch.FloatTensor, pixel_attention_mask: torch.LongTensor = None):
+        return self.model.get_image_features(pixel_values=pixel_values, pixel_attention_mask=pixel_attention_mask)
+
     @can_return_tuple
     @auto_docstring
     def forward(
diff --git a/src/transformers/models/internvl/modeling_internvl.py b/src/transformers/models/internvl/modeling_internvl.py
index 4c747a2394e5..84a2d412ff42 100644
--- a/src/transformers/models/internvl/modeling_internvl.py
+++ b/src/transformers/models/internvl/modeling_internvl.py
@@ -883,6 +883,20 @@ def set_decoder(self, decoder):
     def get_decoder(self):
         return self.model
 
+    def get_image_features(
+        self,
+        pixel_values: torch.FloatTensor,
+        vision_feature_layer: Optional[Union[int, List[int]]] = None,
+        vision_feature_select_strategy: Optional[str] = None,
+        **kwargs,
+    ):
+        return self.model.get_image_features(
+            pixel_values=pixel_values,
+            vision_feature_layer=vision_feature_layer,
+            vision_feature_select_strategy=vision_feature_select_strategy,
+            **kwargs,
+        )
+
     # Make modules available throught conditional class for BC
     @property
     def language_model(self):
diff --git a/src/transformers/models/llava/modeling_llava.py b/src/transformers/models/llava/modeling_llava.py
index 4321dea59d94..aa83b57678b6 100644
--- a/src/transformers/models/llava/modeling_llava.py
+++ b/src/transformers/models/llava/modeling_llava.py
@@ -376,6 +376,20 @@ def set_decoder(self, decoder):
     def get_decoder(self):
         return self.model
 
+    def get_image_features(
+        self,
+        pixel_values: torch.FloatTensor,
+        vision_feature_layer: Optional[Union[int, List[int]]] = None,
+        vision_feature_select_strategy: Optional[str] = None,
+        **kwargs,
+    ):
+        return self.model.get_image_features(
+            pixel_values=pixel_values,
+            vision_feature_layer=vision_feature_layer,
+            vision_feature_select_strategy=vision_feature_select_strategy,
+            **kwargs,
+        )
+
     # Make modules available throught conditional class for BC
     @property
     def language_model(self):
diff --git a/src/transformers/models/llava_next/modeling_llava_next.py b/src/transformers/models/llava_next/modeling_llava_next.py
index e5b597e819ed..010de9060a1b 100644
--- a/src/transformers/models/llava_next/modeling_llava_next.py
+++ b/src/transformers/models/llava_next/modeling_llava_next.py
@@ -574,6 +574,28 @@ def set_decoder(self, decoder):
     def get_decoder(self):
         return self.model
 
+    def pack_image_features(self, image_features, image_sizes, vision_feature_select_strategy, image_newline=None):
+        return self.model.pack_image_features(
+            image_features=image_features,
+            image_sizes=image_sizes,
+            vision_feature_select_strategy=vision_feature_select_strategy,
+            image_newline=image_newline,
+        )
+
+    def get_image_features(
+        self,
+        pixel_values: torch.FloatTensor,
+        image_sizes: torch.Tensor,
+        vision_feature_layer: Optional[Union[int, List[int]]] = None,
+        vision_feature_select_strategy: Optional[str] = None,
+    ):
+        return self.model.get_image_features(
+            pixel_values=pixel_values,
+            image_sizes=image_sizes,
+            vision_feature_layer=vision_feature_layer,
+            vision_feature_select_strategy=vision_feature_select_strategy,
+        )
+
     # Make modules available throught conditional class for BC
     @property
     def language_model(self):
diff --git a/src/transformers/models/llava_next_video/modeling_llava_next_video.py b/src/transformers/models/llava_next_video/modeling_llava_next_video.py
index 368fd09ef32f..9b663bd39032 100644
--- a/src/transformers/models/llava_next_video/modeling_llava_next_video.py
+++ b/src/transformers/models/llava_next_video/modeling_llava_next_video.py
@@ -706,6 +706,28 @@ def set_decoder(self, decoder):
     def get_decoder(self):
         return self.model
 
+    def pack_image_features(self, image_features, image_sizes, vision_feature_select_strategy, image_newline=None):
+        return self.model.pack_image_features(
+            image_features=image_features,
+            image_sizes=image_sizes,
+            vision_feature_select_strategy=vision_feature_select_strategy,
+            image_newline=image_newline,
+        )
+
+    def get_image_features(
+        self,
+        pixel_values: torch.FloatTensor,
+        image_sizes: torch.Tensor,
+        vision_feature_layer: Optional[Union[int, List[int]]] = None,
+        vision_feature_select_strategy: Optional[str] = None,
+    ):
+        return self.model.get_image_features(
+            pixel_values=pixel_values,
+            image_sizes=image_sizes,
+            vision_feature_layer=vision_feature_layer,
+            vision_feature_select_strategy=vision_feature_select_strategy,
+        )
+
     # Make modules available throught conditional class for BC
     @property
     def language_model(self):
@@ -952,5 +974,17 @@ def _prepare_4d_causal_attention_mask_with_cache_position(
 
         return causal_mask
 
+    def get_video_features(
+        self,
+        pixel_values: torch.FloatTensor,
+        vision_feature_layer: Optional[Union[int, List[int]]] = None,
+        vision_feature_select_strategy: Optional[str] = None,
+    ):
+        return self.model.get_video_features(
+            pixel_values=pixel_values,
+            vision_feature_layer=vision_feature_layer,
+            vision_feature_select_strategy=vision_feature_select_strategy,
+        )
+
 
 __all__ = ["LlavaNextVideoForConditionalGeneration", "LlavaNextVideoModel", "LlavaNextVideoPreTrainedModel"]
diff --git a/src/transformers/models/llava_next_video/modular_llava_next_video.py b/src/transformers/models/llava_next_video/modular_llava_next_video.py
index e5d2fd92cb47..97ea22ae2e64 100644
--- a/src/transformers/models/llava_next_video/modular_llava_next_video.py
+++ b/src/transformers/models/llava_next_video/modular_llava_next_video.py
@@ -496,6 +496,18 @@ def forward(
 
 
 class LlavaNextVideoForConditionalGeneration(LlavaNextForConditionalGeneration):
+    def get_video_features(
+        self,
+        pixel_values: torch.FloatTensor,
+        vision_feature_layer: Optional[Union[int, List[int]]] = None,
+        vision_feature_select_strategy: Optional[str] = None,
+    ):
+        return self.model.get_video_features(
+            pixel_values=pixel_values,
+            vision_feature_layer=vision_feature_layer,
+            vision_feature_select_strategy=vision_feature_select_strategy,
+        )
+
     def forward(
         self,
         input_ids: torch.LongTensor = None,
diff --git a/src/transformers/models/llava_onevision/modeling_llava_onevision.py b/src/transformers/models/llava_onevision/modeling_llava_onevision.py
index 5a1157edebf3..4205abf85715 100644
--- a/src/transformers/models/llava_onevision/modeling_llava_onevision.py
+++ b/src/transformers/models/llava_onevision/modeling_llava_onevision.py
@@ -747,6 +747,28 @@ def set_decoder(self, decoder):
     def get_decoder(self):
         return self.model
 
+    def pack_image_features(self, image_features, image_sizes, vision_feature_select_strategy, image_newline=None):
+        return self.model.pack_image_features(
+            image_features=image_features,
+            image_sizes=image_sizes,
+            vision_feature_select_strategy=vision_feature_select_strategy,
+            image_newline=image_newline,
+        )
+
+    def get_image_features(
+        self,
+        pixel_values: torch.FloatTensor,
+        image_sizes: torch.Tensor,
+        vision_feature_layer: Optional[Union[int, List[int]]] = None,
+        vision_feature_select_strategy: Optional[str] = None,
+    ):
+        return self.model.get_image_features(
+            pixel_values=pixel_values,
+            image_sizes=image_sizes,
+            vision_feature_layer=vision_feature_layer,
+            vision_feature_select_strategy=vision_feature_select_strategy,
+        )
+
     # Make modules available throught conditional class for BC
     @property
     def language_model(self):
@@ -988,5 +1010,17 @@ def _prepare_4d_causal_attention_mask_with_cache_position(
 
         return causal_mask
 
+    def get_video_features(
+        self,
+        pixel_values: torch.FloatTensor,
+        vision_feature_layer: Optional[Union[int, List[int]]] = None,
+        vision_feature_select_strategy: Optional[str] = None,
+    ):
+        return self.model.get_video_features(
+            pixel_values=pixel_values,
+            vision_feature_layer=vision_feature_layer,
+            vision_feature_select_strategy=vision_feature_select_strategy,
+        )
+
 
 __all__ = ["LlavaOnevisionModel", "LlavaOnevisionForConditionalGeneration", "LlavaOnevisionPreTrainedModel"]
diff --git a/src/transformers/models/mistral3/modeling_mistral3.py b/src/transformers/models/mistral3/modeling_mistral3.py
index 082020b3afd0..e034392b740a 100644
--- a/src/transformers/models/mistral3/modeling_mistral3.py
+++ b/src/transformers/models/mistral3/modeling_mistral3.py
@@ -412,6 +412,20 @@ def set_decoder(self, decoder):
     def get_decoder(self):
         return self.model
 
+    def get_image_features(
+        self,
+        pixel_values: torch.FloatTensor,
+        image_sizes: torch.Tensor,
+        vision_feature_layer: Optional[Union[int, List[int]]] = None,
+        **kwargs,
+    ):
+        return self.model.get_image_features(
+            pixel_values=pixel_values,
+            image_sizes=image_sizes,
+            vision_feature_layer=vision_feature_layer,
+            **kwargs,
+        )
+
     # Make modules available throught conditional class for BC
     @property
     def language_model(self):
diff --git a/src/transformers/models/mistral3/modular_mistral3.py b/src/transformers/models/mistral3/modular_mistral3.py
index 666b4cde4d5a..611973b64bdd 100644
--- a/src/transformers/models/mistral3/modular_mistral3.py
+++ b/src/transformers/models/mistral3/modular_mistral3.py
@@ -254,6 +254,20 @@ def forward(
 
 
 class Mistral3ForConditionalGeneration(LlavaForConditionalGeneration):
+    def get_image_features(
+        self,
+        pixel_values: torch.FloatTensor,
+        image_sizes: torch.Tensor,
+        vision_feature_layer: Optional[Union[int, List[int]]] = None,
+        **kwargs,
+    ):
+        return self.model.get_image_features(
+            pixel_values=pixel_values,
+            image_sizes=image_sizes,
+            vision_feature_layer=vision_feature_layer,
+            **kwargs,
+        )
+
     def forward(
         self,
         input_ids: torch.LongTensor = None,
diff --git a/src/transformers/models/paligemma/modeling_paligemma.py b/src/transformers/models/paligemma/modeling_paligemma.py
index addba2b30fef..4782da29ce48 100644
--- a/src/transformers/models/paligemma/modeling_paligemma.py
+++ b/src/transformers/models/paligemma/modeling_paligemma.py
@@ -423,6 +423,9 @@ def set_decoder(self, decoder):
     def get_decoder(self):
         return self.model
 
+    def get_image_features(self, pixel_values):
+        return self.model.get_image_features(pixel_values)
+
     # Make modules available throught conditional class for BC
     @property
     def language_model(self):
diff --git a/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
index 1b4cfea5b665..17861e714a98 100644
--- a/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
+++ b/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
@@ -1503,6 +1503,14 @@ def set_decoder(self, decoder):
     def get_decoder(self):
         return self.model
 
+    def get_video_features(
+        self, pixel_values_videos: torch.FloatTensor, video_grid_thw: Optional[torch.LongTensor] = None
+    ):
+        return self.model.get_video_features(pixel_values_videos, video_grid_thw)
+
+    def get_image_features(self, pixel_values: torch.FloatTensor, image_grid_thw: Optional[torch.LongTensor] = None):
+        return self.model.get_image_features(pixel_values, image_grid_thw)
+
     # Make modules available throught conditional class for BC
     @property
     def language_model(self):
diff --git a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py
index a995d064b529..67981d78cb9d 100644
--- a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py
+++ b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py
@@ -1387,6 +1387,14 @@ def set_decoder(self, decoder):
     def get_decoder(self):
         return self.model
 
+    def get_video_features(
+        self, pixel_values_videos: torch.FloatTensor, video_grid_thw: Optional[torch.LongTensor] = None
+    ):
+        return self.model.get_video_features(pixel_values_videos, video_grid_thw)
+
+    def get_image_features(self, pixel_values: torch.FloatTensor, image_grid_thw: Optional[torch.LongTensor] = None):
+        return self.model.get_image_features(pixel_values, image_grid_thw)
+
     # Make modules available throught conditional class for BC
     @property
     def language_model(self):
diff --git a/src/transformers/models/smolvlm/modeling_smolvlm.py b/src/transformers/models/smolvlm/modeling_smolvlm.py
index bbf05404ac12..b7b2f87b4429 100644
--- a/src/transformers/models/smolvlm/modeling_smolvlm.py
+++ b/src/transformers/models/smolvlm/modeling_smolvlm.py
@@ -874,6 +874,9 @@ def get_output_embeddings(self):
     def set_output_embeddings(self, new_embeddings):
         self.lm_head = new_embeddings
 
+    def get_image_features(self, pixel_values: torch.FloatTensor, pixel_attention_mask: torch.LongTensor = None):
+        return self.model.get_image_features(pixel_values=pixel_values, pixel_attention_mask=pixel_attention_mask)
+
     @can_return_tuple
     @auto_docstring
     def forward(
diff --git a/src/transformers/models/video_llava/modeling_video_llava.py b/src/transformers/models/video_llava/modeling_video_llava.py
index ed7a19ca6645..8994bf805ed4 100644
--- a/src/transformers/models/video_llava/modeling_video_llava.py
+++ b/src/transformers/models/video_llava/modeling_video_llava.py
@@ -449,6 +449,18 @@ def set_decoder(self, decoder):
     def get_decoder(self):
         return self.model
 
+    def get_image_features(
+        self,
+        pixel_values_images: torch.FloatTensor,
+        vision_feature_layer: Optional[Union[int, List[int]]] = None,
+        vision_feature_select_strategy: Optional[str] = None,
+    ):
+        return self.model.get_image_features(
+            pixel_values_images=pixel_values_images,
+            vision_feature_layer=vision_feature_layer,
+            vision_feature_select_strategy=vision_feature_select_strategy,
+        )
+
     # Make modules available throught conditional class for BC
     @property
     def language_model(self):
diff --git a/src/transformers/models/vipllava/modeling_vipllava.py b/src/transformers/models/vipllava/modeling_vipllava.py
index c4a20aef9148..e451f3d9af8c 100644
--- a/src/transformers/models/vipllava/modeling_vipllava.py
+++ b/src/transformers/models/vipllava/modeling_vipllava.py
@@ -332,6 +332,11 @@ def set_decoder(self, decoder):
     def get_decoder(self):
         return self.model
 
+    def get_image_features(
+        self, pixel_values: torch.FloatTensor, vision_feature_layers: Optional[Union[int, List[int]]] = None
+    ):
+        return self.model.get_image_features(pixel_values=pixel_values, vision_feature_layers=vision_feature_layers)
+
     # Make modules available throught conditional class for BC
     @property
     def language_model(self):
diff --git a/src/transformers/models/vipllava/modular_vipllava.py b/src/transformers/models/vipllava/modular_vipllava.py
index 5ae33e771c6a..a673bba7a99a 100644
--- a/src/transformers/models/vipllava/modular_vipllava.py
+++ b/src/transformers/models/vipllava/modular_vipllava.py
@@ -184,6 +184,11 @@ def forward(
 
 
 class VipLlavaForConditionalGeneration(LlavaForConditionalGeneration):
+    def get_image_features(
+        self, pixel_values: torch.FloatTensor, vision_feature_layers: Optional[Union[int, List[int]]] = None
+    ):
+        return self.model.get_image_features(pixel_values=pixel_values, vision_feature_layers=vision_feature_layers)
+
     def forward(
         self,
         input_ids: torch.LongTensor = None,
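Taken together, these hunks give each multimodal *ForConditionalGeneration wrapper a thin delegation to the corresponding method on the inner `self.model`, so `get_image_features` / `get_video_features` / `pack_image_features` remain reachable on the top-level class for backward compatibility. Below is a minimal usage sketch, not part of the patch: the checkpoint id, the image URL, and the use of the config attributes for the vision-feature arguments are illustrative assumptions, with Llava chosen only as a representative of the pattern.

```python
# Hypothetical usage sketch: call the delegated helper directly on the wrapper.
import requests
import torch
from PIL import Image
from transformers import AutoProcessor, LlavaForConditionalGeneration

model_id = "llava-hf/llava-1.5-7b-hf"  # assumed checkpoint, for illustration only
processor = AutoProcessor.from_pretrained(model_id)
model = LlavaForConditionalGeneration.from_pretrained(model_id)

# Any RGB image works; this URL is just an example.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
pixel_values = processor.image_processor(images=image, return_tensors="pt").pixel_values

with torch.no_grad():
    # After this patch, the call below forwards to model.model.get_image_features(...).
    image_features = model.get_image_features(
        pixel_values,
        vision_feature_layer=model.config.vision_feature_layer,
        vision_feature_select_strategy=model.config.vision_feature_select_strategy,
    )

print(image_features.shape)  # e.g. (num_images, num_patches, hidden_size)
```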