Skip to content

Commit b78f8c8

Browse files
YushunXiang authored and
Lewis Marshall committed
fix: Add method to get image features in PaliGemmaForConditionalGeneration (huggingface#38730)
* fix: Add method to retrieve image features in PaliGemmaForConditionalGeneration * feat: Add get_image_features method to multiple models for image feature extraction * fix: reformat the files with ruff. * feat: Add methods for packing and retrieving image and video features across multiple models modified: - modeling_chameleon.py - modeling_llava_next.py - modular_llava_next_video.py - modeling_qwen2_vl.py and generate the: - modeling_llava_next_video.py - modeling_llava_onevision.py - modeling_qwen2_5_vl.py * feat: Implement get_image_features method in Aria, Mistral3, and VipLlava models with updated parameters * fix: reformatted the code with fix-style
1 parent a908a2c commit b78f8c8

23 files changed

+269
-0
lines changed

src/transformers/models/aria/modeling_aria.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1225,6 +1225,18 @@ def set_decoder(self, decoder):
12251225
def get_decoder(self):
12261226
return self.model
12271227

1228+
def get_image_features(
    self,
    pixel_values: torch.FloatTensor,
    pixel_mask: Optional[torch.FloatTensor] = None,
    vision_feature_layer: int = -1,
):
    """Delegate image-feature extraction to the wrapped base model.

    Thin passthrough so callers of the conditional-generation wrapper can
    obtain vision features without reaching into ``self.model`` directly.
    """
    vision_kwargs = {
        "pixel_values": pixel_values,
        "pixel_mask": pixel_mask,
        "vision_feature_layer": vision_feature_layer,
    }
    return self.model.get_image_features(**vision_kwargs)
1239+
12281240
# Make modules available through conditional class for BC
12291241
@property
12301242
def language_model(self):

src/transformers/models/aria/modular_aria.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1497,6 +1497,18 @@ def forward(
14971497
"""
14981498
)
14991499
class AriaForConditionalGeneration(LlavaForConditionalGeneration):
1500+
def get_image_features(
    self,
    pixel_values: torch.FloatTensor,
    pixel_mask: Optional[torch.FloatTensor] = None,
    vision_feature_layer: int = -1,
):
    """Forward image-feature extraction to the underlying base model.

    Every argument is passed through unchanged to
    ``self.model.get_image_features``.
    """
    extract = self.model.get_image_features
    return extract(
        pixel_values=pixel_values,
        pixel_mask=pixel_mask,
        vision_feature_layer=vision_feature_layer,
    )
1511+
15001512
@can_return_tuple
15011513
@auto_docstring
15021514
def forward(

src/transformers/models/aya_vision/modeling_aya_vision.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -394,6 +394,20 @@ def set_decoder(self, decoder):
394394
def get_decoder(self):
395395
return self.model
396396

397+
def get_image_features(
    self,
    pixel_values: torch.FloatTensor,
    vision_feature_layer: Optional[Union[int, List[int]]] = None,
    vision_feature_select_strategy: Optional[str] = None,
    **kwargs,
):
    """Expose the base model's image-feature extraction on this wrapper.

    All arguments, including any extra ``kwargs``, are forwarded verbatim
    to ``self.model.get_image_features``.
    """
    extract = self.model.get_image_features
    return extract(
        pixel_values=pixel_values,
        vision_feature_layer=vision_feature_layer,
        vision_feature_select_strategy=vision_feature_select_strategy,
        **kwargs,
    )
410+
397411
# Make modules available through conditional class for BC
398412
@property
399413
def language_model(self):

src/transformers/models/chameleon/modeling_chameleon.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1229,6 +1229,12 @@ def set_decoder(self, decoder):
12291229
def get_decoder(self):
12301230
return self.model
12311231

1232+
def get_image_tokens(self, pixel_values):
    """Forward ``pixel_values`` to the base model's image tokenizer."""
    base = self.model
    return base.get_image_tokens(pixel_values)
1234+
1235+
def get_image_features(self, pixel_values):
    """Forward ``pixel_values`` to the base model's feature extractor."""
    base = self.model
    return base.get_image_features(pixel_values)
1237+
12321238
@can_return_tuple
12331239
@auto_docstring
12341240
def forward(

src/transformers/models/gemma3/modeling_gemma3.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1019,6 +1019,9 @@ def set_decoder(self, decoder):
10191019
def get_decoder(self):
10201020
return self.model
10211021

1022+
def get_image_features(self, pixel_values):
    """Delegate image-feature extraction to the wrapped base model."""
    base = self.model
    return base.get_image_features(pixel_values)
1024+
10221025
# Make modules available through conditional class for BC
10231026
@property
10241027
def language_model(self):

src/transformers/models/got_ocr2/modeling_got_ocr2.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -762,6 +762,20 @@ def set_decoder(self, decoder):
762762
def get_decoder(self):
763763
return self.model
764764

765+
def get_image_features(
    self,
    pixel_values: torch.FloatTensor,
    vision_feature_layer: Optional[Union[int, List[int]]] = None,
    vision_feature_select_strategy: Optional[str] = None,
    **kwargs,
):
    """Surface the base model's image-feature extraction on the wrapper.

    Arguments (including extra ``kwargs``) are handed straight to
    ``self.model.get_image_features`` without modification.
    """
    delegate = self.model.get_image_features
    return delegate(
        pixel_values=pixel_values,
        vision_feature_layer=vision_feature_layer,
        vision_feature_select_strategy=vision_feature_select_strategy,
        **kwargs,
    )
778+
765779
# Make modules available through conditional class for BC
766780
@property
767781
def language_model(self):

src/transformers/models/idefics2/modeling_idefics2.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1186,6 +1186,9 @@ def get_output_embeddings(self):
11861186
def set_output_embeddings(self, new_embeddings):
11871187
self.lm_head = new_embeddings
11881188

1189+
def get_image_features(
    self,
    pixel_values: torch.FloatTensor,
    pixel_attention_mask: Optional[torch.LongTensor] = None,
):
    """Return vision features for a batch of images.

    Thin delegation to ``self.model.get_image_features`` so the
    conditional-generation wrapper exposes the same entry point as the
    base model.

    Args:
        pixel_values: Input image tensor.
        pixel_attention_mask: Optional mask of valid (non-padded) pixels;
            forwarded unchanged. Annotated explicitly as ``Optional`` —
            the original ``torch.LongTensor = None`` was an implicit
            Optional, which PEP 484 disallows.
    """
    return self.model.get_image_features(pixel_values=pixel_values, pixel_attention_mask=pixel_attention_mask)
1191+
11891192
@can_return_tuple
11901193
@auto_docstring
11911194
def forward(

src/transformers/models/idefics3/modeling_idefics3.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -909,6 +909,9 @@ def get_output_embeddings(self):
909909
def set_output_embeddings(self, new_embeddings):
910910
self.lm_head = new_embeddings
911911

912+
def get_image_features(
    self,
    pixel_values: torch.FloatTensor,
    pixel_attention_mask: Optional[torch.LongTensor] = None,
):
    """Return vision features for a batch of images.

    Delegates to ``self.model.get_image_features`` so callers of the
    conditional-generation wrapper get the same entry point as the base
    model.

    Args:
        pixel_values: Input image tensor.
        pixel_attention_mask: Optional mask of valid (non-padded) pixels;
            forwarded unchanged. Annotated explicitly as ``Optional`` —
            the original ``torch.LongTensor = None`` was an implicit
            Optional, which PEP 484 disallows.
    """
    return self.model.get_image_features(pixel_values=pixel_values, pixel_attention_mask=pixel_attention_mask)
914+
912915
@can_return_tuple
913916
@auto_docstring
914917
def forward(

src/transformers/models/internvl/modeling_internvl.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -883,6 +883,20 @@ def set_decoder(self, decoder):
883883
def get_decoder(self):
884884
return self.model
885885

886+
def get_image_features(
    self,
    pixel_values: torch.FloatTensor,
    vision_feature_layer: Optional[Union[int, List[int]]] = None,
    vision_feature_select_strategy: Optional[str] = None,
    **kwargs,
):
    """Passthrough to the base model's image-feature extraction.

    Keeps the wrapper's public surface aligned with ``self.model``;
    every argument, including extra ``kwargs``, is forwarded as-is.
    """
    base_model = self.model
    return base_model.get_image_features(
        pixel_values=pixel_values,
        vision_feature_layer=vision_feature_layer,
        vision_feature_select_strategy=vision_feature_select_strategy,
        **kwargs,
    )
899+
886900
# Make modules available through conditional class for BC
887901
@property
888902
def language_model(self):

src/transformers/models/llava/modeling_llava.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -376,6 +376,20 @@ def set_decoder(self, decoder):
376376
def get_decoder(self):
377377
return self.model
378378

379+
def get_image_features(
    self,
    pixel_values: torch.FloatTensor,
    vision_feature_layer: Optional[Union[int, List[int]]] = None,
    vision_feature_select_strategy: Optional[str] = None,
    **kwargs,
):
    """Delegate image-feature extraction to the underlying base model.

    This thin wrapper exists so the conditional-generation class offers
    the same ``get_image_features`` entry point as ``self.model``; all
    arguments (plus any extra ``kwargs``) are forwarded untouched.
    """
    extract = self.model.get_image_features
    return extract(
        pixel_values=pixel_values,
        vision_feature_layer=vision_feature_layer,
        vision_feature_select_strategy=vision_feature_select_strategy,
        **kwargs,
    )
392+
379393
# Make modules available through conditional class for BC
380394
@property
381395
def language_model(self):

0 commit comments

Comments
 (0)