diff --git a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py index 5838c460e7c5..8593b13344c5 100644 --- a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py @@ -1693,6 +1693,9 @@ def forward( if attention_mask is not None: attention_mask = attention_mask.to(inputs_embeds.device) + if position_ids is None and input_ids is not None: + position_ids, _ = self.get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask) + outputs = self.model( input_ids=None, position_ids=position_ids,