Skip to content
12 changes: 12 additions & 0 deletions src/transformers/models/aria/modeling_aria.py
Original file line number Diff line number Diff line change
Expand Up @@ -1225,6 +1225,18 @@ def set_decoder(self, decoder):
def get_decoder(self):
return self.model

def get_image_features(
    self,
    pixel_values: torch.FloatTensor,
    pixel_mask: Optional[torch.FloatTensor] = None,
    vision_feature_layer: int = -1,
):
    """Thin wrapper that forwards image-feature extraction to `self.model.get_image_features`.

    Args:
        pixel_values: Pixel values of the input images.
        pixel_mask: Optional mask forwarded unchanged to the base model.
        vision_feature_layer: Index of the vision-encoder layer to take features from.
    """
    return self.model.get_image_features(
        pixel_values=pixel_values,
        pixel_mask=pixel_mask,
        vision_feature_layer=vision_feature_layer,
    )

# Make modules available through conditional class for BC
@property
def language_model(self):
Expand Down
12 changes: 12 additions & 0 deletions src/transformers/models/aria/modular_aria.py
Original file line number Diff line number Diff line change
Expand Up @@ -1497,6 +1497,18 @@ def forward(
"""
)
class AriaForConditionalGeneration(LlavaForConditionalGeneration):
def get_image_features(
    self,
    pixel_values: torch.FloatTensor,
    pixel_mask: Optional[torch.FloatTensor] = None,
    vision_feature_layer: int = -1,
):
    """Thin wrapper that forwards image-feature extraction to `self.model.get_image_features`.

    Args:
        pixel_values: Pixel values of the input images.
        pixel_mask: Optional mask forwarded unchanged to the base model.
        vision_feature_layer: Index of the vision-encoder layer to take features from.
    """
    return self.model.get_image_features(
        pixel_values=pixel_values,
        pixel_mask=pixel_mask,
        vision_feature_layer=vision_feature_layer,
    )

@can_return_tuple
@auto_docstring
def forward(
Expand Down
14 changes: 14 additions & 0 deletions src/transformers/models/aya_vision/modeling_aya_vision.py
Original file line number Diff line number Diff line change
Expand Up @@ -394,6 +394,20 @@ def set_decoder(self, decoder):
def get_decoder(self):
return self.model

def get_image_features(
    self,
    pixel_values: torch.FloatTensor,
    vision_feature_layer: Optional[Union[int, List[int]]] = None,
    vision_feature_select_strategy: Optional[str] = None,
    **kwargs,
):
    """Thin wrapper that forwards image-feature extraction to `self.model.get_image_features`.

    Args:
        pixel_values: Pixel values of the input images.
        vision_feature_layer: Layer index (or list of indices) to take vision features from.
        vision_feature_select_strategy: Feature-selection strategy forwarded unchanged.
        **kwargs: Extra keyword arguments passed through to the base model.
    """
    return self.model.get_image_features(
        pixel_values=pixel_values,
        vision_feature_layer=vision_feature_layer,
        vision_feature_select_strategy=vision_feature_select_strategy,
        **kwargs,
    )

# Make modules available through conditional class for BC
@property
def language_model(self):
Expand Down
6 changes: 6 additions & 0 deletions src/transformers/models/chameleon/modeling_chameleon.py
Original file line number Diff line number Diff line change
Expand Up @@ -1229,6 +1229,12 @@ def set_decoder(self, decoder):
def get_decoder(self):
return self.model

def get_image_tokens(self, pixel_values):
    """Thin wrapper that forwards discrete image-token lookup to `self.model.get_image_tokens`."""
    return self.model.get_image_tokens(pixel_values)

def get_image_features(self, pixel_values):
    """Thin wrapper that forwards image-feature extraction to `self.model.get_image_features`."""
    return self.model.get_image_features(pixel_values)

Comment on lines +1235 to +1237
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

for chameleon and emu3, the helper was get_image_tokens. Can we propagate that too?

@can_return_tuple
@auto_docstring
def forward(
Expand Down
3 changes: 3 additions & 0 deletions src/transformers/models/gemma3/modeling_gemma3.py
Original file line number Diff line number Diff line change
Expand Up @@ -1019,6 +1019,9 @@ def set_decoder(self, decoder):
def get_decoder(self):
return self.model

def get_image_features(self, pixel_values):
    """Thin wrapper that forwards image-feature extraction to `self.model.get_image_features`."""
    return self.model.get_image_features(pixel_values)

# Make modules available through conditional class for BC
@property
def language_model(self):
Expand Down
14 changes: 14 additions & 0 deletions src/transformers/models/got_ocr2/modeling_got_ocr2.py
Original file line number Diff line number Diff line change
Expand Up @@ -762,6 +762,20 @@ def set_decoder(self, decoder):
def get_decoder(self):
return self.model

def get_image_features(
    self,
    pixel_values: torch.FloatTensor,
    vision_feature_layer: Optional[Union[int, List[int]]] = None,
    vision_feature_select_strategy: Optional[str] = None,
    **kwargs,
):
    """Thin wrapper that forwards image-feature extraction to `self.model.get_image_features`.

    Args:
        pixel_values: Pixel values of the input images.
        vision_feature_layer: Layer index (or list of indices) to take vision features from.
        vision_feature_select_strategy: Feature-selection strategy forwarded unchanged.
        **kwargs: Extra keyword arguments passed through to the base model.
    """
    return self.model.get_image_features(
        pixel_values=pixel_values,
        vision_feature_layer=vision_feature_layer,
        vision_feature_select_strategy=vision_feature_select_strategy,
        **kwargs,
    )

# Make modules available through conditional class for BC
@property
def language_model(self):
Expand Down
3 changes: 3 additions & 0 deletions src/transformers/models/idefics2/modeling_idefics2.py
Original file line number Diff line number Diff line change
Expand Up @@ -1186,6 +1186,9 @@ def get_output_embeddings(self):
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings

def get_image_features(self, pixel_values: torch.FloatTensor, pixel_attention_mask: torch.LongTensor = None):
    """Thin wrapper that forwards image-feature extraction to `self.model.get_image_features`."""
    return self.model.get_image_features(pixel_values=pixel_values, pixel_attention_mask=pixel_attention_mask)

@can_return_tuple
@auto_docstring
def forward(
Expand Down
3 changes: 3 additions & 0 deletions src/transformers/models/idefics3/modeling_idefics3.py
Original file line number Diff line number Diff line change
Expand Up @@ -909,6 +909,9 @@ def get_output_embeddings(self):
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings

def get_image_features(self, pixel_values: torch.FloatTensor, pixel_attention_mask: torch.LongTensor = None):
    """Thin wrapper that forwards image-feature extraction to `self.model.get_image_features`."""
    return self.model.get_image_features(pixel_values=pixel_values, pixel_attention_mask=pixel_attention_mask)

@can_return_tuple
@auto_docstring
def forward(
Expand Down
14 changes: 14 additions & 0 deletions src/transformers/models/internvl/modeling_internvl.py
Original file line number Diff line number Diff line change
Expand Up @@ -883,6 +883,20 @@ def set_decoder(self, decoder):
def get_decoder(self):
return self.model

def get_image_features(
    self,
    pixel_values: torch.FloatTensor,
    vision_feature_layer: Optional[Union[int, List[int]]] = None,
    vision_feature_select_strategy: Optional[str] = None,
    **kwargs,
):
    """Thin wrapper that forwards image-feature extraction to `self.model.get_image_features`.

    Args:
        pixel_values: Pixel values of the input images.
        vision_feature_layer: Layer index (or list of indices) to take vision features from.
        vision_feature_select_strategy: Feature-selection strategy forwarded unchanged.
        **kwargs: Extra keyword arguments passed through to the base model.
    """
    return self.model.get_image_features(
        pixel_values=pixel_values,
        vision_feature_layer=vision_feature_layer,
        vision_feature_select_strategy=vision_feature_select_strategy,
        **kwargs,
    )

# Make modules available through conditional class for BC
@property
def language_model(self):
Expand Down
14 changes: 14 additions & 0 deletions src/transformers/models/llava/modeling_llava.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,6 +376,20 @@ def set_decoder(self, decoder):
def get_decoder(self):
return self.model

def get_image_features(
    self,
    pixel_values: torch.FloatTensor,
    vision_feature_layer: Optional[Union[int, List[int]]] = None,
    vision_feature_select_strategy: Optional[str] = None,
    **kwargs,
):
    """Thin wrapper that forwards image-feature extraction to `self.model.get_image_features`.

    Args:
        pixel_values: Pixel values of the input images.
        vision_feature_layer: Layer index (or list of indices) to take vision features from.
        vision_feature_select_strategy: Feature-selection strategy forwarded unchanged.
        **kwargs: Extra keyword arguments passed through to the base model.
    """
    return self.model.get_image_features(
        pixel_values=pixel_values,
        vision_feature_layer=vision_feature_layer,
        vision_feature_select_strategy=vision_feature_select_strategy,
        **kwargs,
    )

# Make modules available through conditional class for BC
@property
def language_model(self):
Expand Down
22 changes: 22 additions & 0 deletions src/transformers/models/llava_next/modeling_llava_next.py
Original file line number Diff line number Diff line change
Expand Up @@ -574,6 +574,28 @@ def set_decoder(self, decoder):
def get_decoder(self):
return self.model

def pack_image_features(self, image_features, image_sizes, vision_feature_select_strategy, image_newline=None):
    """Thin wrapper that forwards image-feature packing to `self.model.pack_image_features`."""
    return self.model.pack_image_features(
        image_features=image_features,
        image_sizes=image_sizes,
        vision_feature_select_strategy=vision_feature_select_strategy,
        image_newline=image_newline,
    )

def get_image_features(
    self,
    pixel_values: torch.FloatTensor,
    image_sizes: torch.Tensor,
    vision_feature_layer: Optional[Union[int, List[int]]] = None,
    vision_feature_select_strategy: Optional[str] = None,
):
    """Thin wrapper that forwards image-feature extraction to `self.model.get_image_features`.

    Args:
        pixel_values: Pixel values of the input images.
        image_sizes: Original (pre-processing) image sizes, forwarded unchanged.
        vision_feature_layer: Layer index (or list of indices) to take vision features from.
        vision_feature_select_strategy: Feature-selection strategy forwarded unchanged.
    """
    return self.model.get_image_features(
        pixel_values=pixel_values,
        image_sizes=image_sizes,
        vision_feature_layer=vision_feature_layer,
        vision_feature_select_strategy=vision_feature_select_strategy,
    )

# Make modules available through conditional class for BC
@property
def language_model(self):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -706,6 +706,28 @@ def set_decoder(self, decoder):
def get_decoder(self):
return self.model

def pack_image_features(self, image_features, image_sizes, vision_feature_select_strategy, image_newline=None):
    """Thin wrapper that forwards image-feature packing to `self.model.pack_image_features`."""
    return self.model.pack_image_features(
        image_features=image_features,
        image_sizes=image_sizes,
        vision_feature_select_strategy=vision_feature_select_strategy,
        image_newline=image_newline,
    )

def get_image_features(
    self,
    pixel_values: torch.FloatTensor,
    image_sizes: torch.Tensor,
    vision_feature_layer: Optional[Union[int, List[int]]] = None,
    vision_feature_select_strategy: Optional[str] = None,
):
    """Thin wrapper that forwards image-feature extraction to `self.model.get_image_features`.

    Args:
        pixel_values: Pixel values of the input images.
        image_sizes: Original (pre-processing) image sizes, forwarded unchanged.
        vision_feature_layer: Layer index (or list of indices) to take vision features from.
        vision_feature_select_strategy: Feature-selection strategy forwarded unchanged.
    """
    return self.model.get_image_features(
        pixel_values=pixel_values,
        image_sizes=image_sizes,
        vision_feature_layer=vision_feature_layer,
        vision_feature_select_strategy=vision_feature_select_strategy,
    )

# Make modules available through conditional class for BC
@property
def language_model(self):
Expand Down Expand Up @@ -952,5 +974,17 @@ def _prepare_4d_causal_attention_mask_with_cache_position(

return causal_mask

def get_video_features(
    self,
    pixel_values: torch.FloatTensor,
    vision_feature_layer: Optional[Union[int, List[int]]] = None,
    vision_feature_select_strategy: Optional[str] = None,
):
    """Thin wrapper that forwards video-feature extraction to `self.model.get_video_features`.

    Args:
        pixel_values: Pixel values of the input video frames.
        vision_feature_layer: Layer index (or list of indices) to take vision features from.
        vision_feature_select_strategy: Feature-selection strategy forwarded unchanged.
    """
    return self.model.get_video_features(
        pixel_values=pixel_values,
        vision_feature_layer=vision_feature_layer,
        vision_feature_select_strategy=vision_feature_select_strategy,
    )


__all__ = ["LlavaNextVideoForConditionalGeneration", "LlavaNextVideoModel", "LlavaNextVideoPreTrainedModel"]
Original file line number Diff line number Diff line change
Expand Up @@ -496,6 +496,18 @@ def forward(


class LlavaNextVideoForConditionalGeneration(LlavaNextForConditionalGeneration):
def get_video_features(
    self,
    pixel_values: torch.FloatTensor,
    vision_feature_layer: Optional[Union[int, List[int]]] = None,
    vision_feature_select_strategy: Optional[str] = None,
):
    """Thin wrapper that forwards video-feature extraction to `self.model.get_video_features`.

    Args:
        pixel_values: Pixel values of the input video frames.
        vision_feature_layer: Layer index (or list of indices) to take vision features from.
        vision_feature_select_strategy: Feature-selection strategy forwarded unchanged.
    """
    return self.model.get_video_features(
        pixel_values=pixel_values,
        vision_feature_layer=vision_feature_layer,
        vision_feature_select_strategy=vision_feature_select_strategy,
    )

def forward(
self,
input_ids: torch.LongTensor = None,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -747,6 +747,28 @@ def set_decoder(self, decoder):
def get_decoder(self):
return self.model

def pack_image_features(self, image_features, image_sizes, vision_feature_select_strategy, image_newline=None):
    """Thin wrapper that forwards image-feature packing to `self.model.pack_image_features`."""
    return self.model.pack_image_features(
        image_features=image_features,
        image_sizes=image_sizes,
        vision_feature_select_strategy=vision_feature_select_strategy,
        image_newline=image_newline,
    )

def get_image_features(
    self,
    pixel_values: torch.FloatTensor,
    image_sizes: torch.Tensor,
    vision_feature_layer: Optional[Union[int, List[int]]] = None,
    vision_feature_select_strategy: Optional[str] = None,
):
    """Thin wrapper that forwards image-feature extraction to `self.model.get_image_features`.

    Args:
        pixel_values: Pixel values of the input images.
        image_sizes: Original (pre-processing) image sizes, forwarded unchanged.
        vision_feature_layer: Layer index (or list of indices) to take vision features from.
        vision_feature_select_strategy: Feature-selection strategy forwarded unchanged.
    """
    return self.model.get_image_features(
        pixel_values=pixel_values,
        image_sizes=image_sizes,
        vision_feature_layer=vision_feature_layer,
        vision_feature_select_strategy=vision_feature_select_strategy,
    )

# Make modules available through conditional class for BC
@property
def language_model(self):
Expand Down Expand Up @@ -988,5 +1010,17 @@ def _prepare_4d_causal_attention_mask_with_cache_position(

return causal_mask

def get_video_features(
    self,
    pixel_values: torch.FloatTensor,
    vision_feature_layer: Optional[Union[int, List[int]]] = None,
    vision_feature_select_strategy: Optional[str] = None,
):
    """Thin wrapper that forwards video-feature extraction to `self.model.get_video_features`.

    Args:
        pixel_values: Pixel values of the input video frames.
        vision_feature_layer: Layer index (or list of indices) to take vision features from.
        vision_feature_select_strategy: Feature-selection strategy forwarded unchanged.
    """
    return self.model.get_video_features(
        pixel_values=pixel_values,
        vision_feature_layer=vision_feature_layer,
        vision_feature_select_strategy=vision_feature_select_strategy,
    )


__all__ = ["LlavaOnevisionModel", "LlavaOnevisionForConditionalGeneration", "LlavaOnevisionPreTrainedModel"]
14 changes: 14 additions & 0 deletions src/transformers/models/mistral3/modeling_mistral3.py
Original file line number Diff line number Diff line change
Expand Up @@ -412,6 +412,20 @@ def set_decoder(self, decoder):
def get_decoder(self):
return self.model

def get_image_features(
    self,
    pixel_values: torch.FloatTensor,
    image_sizes: torch.Tensor,
    vision_feature_layer: Optional[Union[int, List[int]]] = None,
    **kwargs,
):
    """Thin wrapper that forwards image-feature extraction to `self.model.get_image_features`.

    Args:
        pixel_values: Pixel values of the input images.
        image_sizes: Original (pre-processing) image sizes, forwarded unchanged.
        vision_feature_layer: Layer index (or list of indices) to take vision features from.
        **kwargs: Extra keyword arguments passed through to the base model.
    """
    return self.model.get_image_features(
        pixel_values=pixel_values,
        image_sizes=image_sizes,
        vision_feature_layer=vision_feature_layer,
        **kwargs,
    )

# Make modules available through conditional class for BC
@property
def language_model(self):
Expand Down
14 changes: 14 additions & 0 deletions src/transformers/models/mistral3/modular_mistral3.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,20 @@ def forward(


class Mistral3ForConditionalGeneration(LlavaForConditionalGeneration):
def get_image_features(
    self,
    pixel_values: torch.FloatTensor,
    image_sizes: torch.Tensor,
    vision_feature_layer: Optional[Union[int, List[int]]] = None,
    **kwargs,
):
    """Thin wrapper that forwards image-feature extraction to `self.model.get_image_features`.

    Args:
        pixel_values: Pixel values of the input images.
        image_sizes: Original (pre-processing) image sizes, forwarded unchanged.
        vision_feature_layer: Layer index (or list of indices) to take vision features from.
        **kwargs: Extra keyword arguments passed through to the base model.
    """
    return self.model.get_image_features(
        pixel_values=pixel_values,
        image_sizes=image_sizes,
        vision_feature_layer=vision_feature_layer,
        **kwargs,
    )

def forward(
self,
input_ids: torch.LongTensor = None,
Expand Down
3 changes: 3 additions & 0 deletions src/transformers/models/paligemma/modeling_paligemma.py
Original file line number Diff line number Diff line change
Expand Up @@ -423,6 +423,9 @@ def set_decoder(self, decoder):
def get_decoder(self):
return self.model

def get_image_features(self, pixel_values):
    """Thin wrapper that forwards image-feature extraction to `self.model.get_image_features`."""
    return self.model.get_image_features(pixel_values)

# Make modules available through conditional class for BC
@property
def language_model(self):
Expand Down
8 changes: 8 additions & 0 deletions src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
Original file line number Diff line number Diff line change
Expand Up @@ -1503,6 +1503,14 @@ def set_decoder(self, decoder):
def get_decoder(self):
return self.model

def get_video_features(
    self, pixel_values_videos: torch.FloatTensor, video_grid_thw: Optional[torch.LongTensor] = None
):
    """Thin wrapper that forwards video-feature extraction to `self.model.get_video_features`."""
    return self.model.get_video_features(pixel_values_videos, video_grid_thw)

def get_image_features(self, pixel_values: torch.FloatTensor, image_grid_thw: Optional[torch.LongTensor] = None):
    """Thin wrapper that forwards image-feature extraction to `self.model.get_image_features`."""
    return self.model.get_image_features(pixel_values, image_grid_thw)

# Make modules available through conditional class for BC
@property
def language_model(self):
Expand Down
8 changes: 8 additions & 0 deletions src/transformers/models/qwen2_vl/modeling_qwen2_vl.py
Original file line number Diff line number Diff line change
Expand Up @@ -1387,6 +1387,14 @@ def set_decoder(self, decoder):
def get_decoder(self):
return self.model

def get_video_features(
    self, pixel_values_videos: torch.FloatTensor, video_grid_thw: Optional[torch.LongTensor] = None
):
    """Thin wrapper that forwards video-feature extraction to `self.model.get_video_features`."""
    return self.model.get_video_features(pixel_values_videos, video_grid_thw)

def get_image_features(self, pixel_values: torch.FloatTensor, image_grid_thw: Optional[torch.LongTensor] = None):
    """Thin wrapper that forwards image-feature extraction to `self.model.get_image_features`."""
    return self.model.get_image_features(pixel_values, image_grid_thw)

# Make modules available through conditional class for BC
@property
def language_model(self):
Expand Down
3 changes: 3 additions & 0 deletions src/transformers/models/smolvlm/modeling_smolvlm.py
Original file line number Diff line number Diff line change
Expand Up @@ -874,6 +874,9 @@ def get_output_embeddings(self):
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings

def get_image_features(self, pixel_values: torch.FloatTensor, pixel_attention_mask: torch.LongTensor = None):
    """Thin wrapper that forwards image-feature extraction to `self.model.get_image_features`."""
    return self.model.get_image_features(pixel_values=pixel_values, pixel_attention_mask=pixel_attention_mask)

@can_return_tuple
@auto_docstring
def forward(
Expand Down
Loading