From ab276066e2b8ba2f43d9991595ebb11e50d55361 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Thu, 7 Aug 2025 05:47:01 +0000 Subject: [PATCH 1/2] [bugfix] ensure correct tensor device in Idefics2, Idefics3, and SmolVLM models --- src/transformers/models/idefics2/modeling_idefics2.py | 8 ++++---- src/transformers/models/idefics3/modeling_idefics3.py | 8 ++++---- src/transformers/models/smolvlm/modeling_smolvlm.py | 8 ++++---- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/transformers/models/idefics2/modeling_idefics2.py b/src/transformers/models/idefics2/modeling_idefics2.py index d25cf5e2f2a1..0586fc180c1c 100644 --- a/src/transformers/models/idefics2/modeling_idefics2.py +++ b/src/transformers/models/idefics2/modeling_idefics2.py @@ -144,11 +144,11 @@ def forward(self, pixel_values: torch.FloatTensor, patch_attention_mask: torch.B position_ids = torch.full(size=(batch_size, max_nb_patches_h * max_nb_patches_w), fill_value=0) for batch_idx, p_attn_mask in enumerate(patch_attention_mask): - nb_patches_h = p_attn_mask[:, 0].sum() - nb_patches_w = p_attn_mask[0].sum() + nb_patches_h = p_attn_mask[:, 0].sum().item() + nb_patches_w = p_attn_mask[0].sum().item() - h_indices = torch.arange(nb_patches_h, device=pixel_values.device, dtype=pixel_values.dtype) - w_indices = torch.arange(nb_patches_w, device=pixel_values.device, dtype=pixel_values.dtype) + h_indices = torch.arange(nb_patches_h, dtype=pixel_values.dtype) + w_indices = torch.arange(nb_patches_w, dtype=pixel_values.dtype) fractional_coords_h = h_indices / nb_patches_h * (1 - 1e-6) fractional_coords_w = w_indices / nb_patches_w * (1 - 1e-6) diff --git a/src/transformers/models/idefics3/modeling_idefics3.py b/src/transformers/models/idefics3/modeling_idefics3.py index c2d41aac02d7..c42ac1f13a49 100644 --- a/src/transformers/models/idefics3/modeling_idefics3.py +++ b/src/transformers/models/idefics3/modeling_idefics3.py @@ -144,11 +144,11 @@ def forward(self, pixel_values: torch.FloatTensor, patch_attention_mask: torch.B position_ids = torch.full(size=(batch_size, max_nb_patches_h * max_nb_patches_w), fill_value=0) for batch_idx, p_attn_mask in enumerate(patch_attention_mask): - nb_patches_h = p_attn_mask[:, 0].sum() - nb_patches_w = p_attn_mask[0].sum() + nb_patches_h = p_attn_mask[:, 0].sum().item() + nb_patches_w = p_attn_mask[0].sum().item() - h_indices = torch.arange(nb_patches_h, device=pixel_values.device, dtype=pixel_values.dtype) - w_indices = torch.arange(nb_patches_w, device=pixel_values.device, dtype=pixel_values.dtype) + h_indices = torch.arange(nb_patches_h, dtype=pixel_values.dtype) + w_indices = torch.arange(nb_patches_w, dtype=pixel_values.dtype) fractional_coords_h = h_indices / nb_patches_h * (1 - 1e-6) fractional_coords_w = w_indices / nb_patches_w * (1 - 1e-6) diff --git a/src/transformers/models/smolvlm/modeling_smolvlm.py b/src/transformers/models/smolvlm/modeling_smolvlm.py index 745206868581..2c33fca04f63 100644 --- a/src/transformers/models/smolvlm/modeling_smolvlm.py +++ b/src/transformers/models/smolvlm/modeling_smolvlm.py @@ -139,11 +139,11 @@ def forward(self, pixel_values: torch.FloatTensor, patch_attention_mask: torch.B position_ids = torch.full(size=(batch_size, max_nb_patches_h * max_nb_patches_w), fill_value=0) for batch_idx, p_attn_mask in enumerate(patch_attention_mask): - nb_patches_h = p_attn_mask[:, 0].sum() - nb_patches_w = p_attn_mask[0].sum() + nb_patches_h = p_attn_mask[:, 0].sum().item() + nb_patches_w = p_attn_mask[0].sum().item() - h_indices = torch.arange(nb_patches_h, device=pixel_values.device, dtype=pixel_values.dtype) - w_indices = torch.arange(nb_patches_w, device=pixel_values.device, dtype=pixel_values.dtype) + h_indices = torch.arange(nb_patches_h, dtype=pixel_values.dtype) + w_indices = torch.arange(nb_patches_w, dtype=pixel_values.dtype) fractional_coords_h = h_indices / nb_patches_h * (1 - 1e-6) fractional_coords_w = w_indices / nb_patches_w * (1 - 1e-6) From 36371b0205390b9c40e646fb2e932a01a178b692 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Tue, 12 Aug 2025 05:26:06 +0000 Subject: [PATCH 2/2] to cuda --- .../models/idefics2/modeling_idefics2.py | 19 +++++++++++-------- .../models/idefics3/modeling_idefics3.py | 19 +++++++++++-------- .../models/smolvlm/modeling_smolvlm.py | 19 +++++++++++-------- 3 files changed, 33 insertions(+), 24 deletions(-) diff --git a/src/transformers/models/idefics2/modeling_idefics2.py b/src/transformers/models/idefics2/modeling_idefics2.py index 0586fc180c1c..90b972bc3dee 100644 --- a/src/transformers/models/idefics2/modeling_idefics2.py +++ b/src/transformers/models/idefics2/modeling_idefics2.py @@ -140,15 +140,19 @@ def forward(self, pixel_values: torch.FloatTensor, patch_attention_mask: torch.B embeddings = patch_embeds.flatten(2).transpose(1, 2) max_nb_patches_h, max_nb_patches_w = max_im_h // self.patch_size, max_im_w // self.patch_size - boundaries = torch.arange(1 / self.num_patches_per_side, 1.0, 1 / self.num_patches_per_side) - position_ids = torch.full(size=(batch_size, max_nb_patches_h * max_nb_patches_w), fill_value=0) + boundaries = torch.arange( + 1 / self.num_patches_per_side, 1.0, 1 / self.num_patches_per_side, device=pixel_values.device + ) + position_ids = torch.full( + size=(batch_size, max_nb_patches_h * max_nb_patches_w), fill_value=0, device=pixel_values.device + ) for batch_idx, p_attn_mask in enumerate(patch_attention_mask): - nb_patches_h = p_attn_mask[:, 0].sum().item() - nb_patches_w = p_attn_mask[0].sum().item() + nb_patches_h = p_attn_mask[:, 0].sum() + nb_patches_w = p_attn_mask[0].sum() - h_indices = torch.arange(nb_patches_h, dtype=pixel_values.dtype) - w_indices = torch.arange(nb_patches_w, dtype=pixel_values.dtype) + h_indices = torch.arange(nb_patches_h, device=pixel_values.device, dtype=pixel_values.dtype) + w_indices = torch.arange(nb_patches_w, device=pixel_values.device, dtype=pixel_values.dtype) fractional_coords_h = h_indices / nb_patches_h * (1 - 1e-6) fractional_coords_w = w_indices / nb_patches_w * (1 - 1e-6) @@ -157,9 +161,8 @@ def forward(self, pixel_values: torch.FloatTensor, patch_attention_mask: torch.B bucket_coords_w = torch.bucketize(fractional_coords_w, boundaries, right=True) pos_ids = (bucket_coords_h[:, None] * self.num_patches_per_side + bucket_coords_w).flatten() - position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids + position_ids[batch_idx][p_attn_mask.view(-1)] = pos_ids - position_ids = position_ids.to(self.position_embedding.weight.device) embeddings = embeddings + self.position_embedding(position_ids) return embeddings diff --git a/src/transformers/models/idefics3/modeling_idefics3.py b/src/transformers/models/idefics3/modeling_idefics3.py index c42ac1f13a49..a34f1551e243 100644 --- a/src/transformers/models/idefics3/modeling_idefics3.py +++ b/src/transformers/models/idefics3/modeling_idefics3.py @@ -140,15 +140,19 @@ def forward(self, pixel_values: torch.FloatTensor, patch_attention_mask: torch.B embeddings = patch_embeds.flatten(2).transpose(1, 2) max_nb_patches_h, max_nb_patches_w = max_im_h // self.patch_size, max_im_w // self.patch_size - boundaries = torch.arange(1 / self.num_patches_per_side, 1.0, 1 / self.num_patches_per_side) - position_ids = torch.full(size=(batch_size, max_nb_patches_h * max_nb_patches_w), fill_value=0) + boundaries = torch.arange( + 1 / self.num_patches_per_side, 1.0, 1 / self.num_patches_per_side, device=pixel_values.device + ) + position_ids = torch.full( + size=(batch_size, max_nb_patches_h * max_nb_patches_w), fill_value=0, device=pixel_values.device + ) for batch_idx, p_attn_mask in enumerate(patch_attention_mask): - nb_patches_h = p_attn_mask[:, 0].sum().item() - nb_patches_w = p_attn_mask[0].sum().item() + nb_patches_h = p_attn_mask[:, 0].sum() + nb_patches_w = p_attn_mask[0].sum() - h_indices = torch.arange(nb_patches_h, dtype=pixel_values.dtype) - w_indices = torch.arange(nb_patches_w, dtype=pixel_values.dtype) + h_indices = torch.arange(nb_patches_h, device=pixel_values.device, dtype=pixel_values.dtype) + w_indices = torch.arange(nb_patches_w, device=pixel_values.device, dtype=pixel_values.dtype) fractional_coords_h = h_indices / nb_patches_h * (1 - 1e-6) fractional_coords_w = w_indices / nb_patches_w * (1 - 1e-6) @@ -157,9 +161,8 @@ def forward(self, pixel_values: torch.FloatTensor, patch_attention_mask: torch.B bucket_coords_w = torch.bucketize(fractional_coords_w, boundaries, right=True) pos_ids = (bucket_coords_h[:, None] * self.num_patches_per_side + bucket_coords_w).flatten() - position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids + position_ids[batch_idx][p_attn_mask.view(-1)] = pos_ids - position_ids = position_ids.to(self.position_embedding.weight.device) embeddings = embeddings + self.position_embedding(position_ids) return embeddings diff --git a/src/transformers/models/smolvlm/modeling_smolvlm.py b/src/transformers/models/smolvlm/modeling_smolvlm.py index 2c33fca04f63..c12809c0b896 100644 --- a/src/transformers/models/smolvlm/modeling_smolvlm.py +++ b/src/transformers/models/smolvlm/modeling_smolvlm.py @@ -135,15 +135,19 @@ def forward(self, pixel_values: torch.FloatTensor, patch_attention_mask: torch.B embeddings = patch_embeds.flatten(2).transpose(1, 2) max_nb_patches_h, max_nb_patches_w = max_im_h // self.patch_size, max_im_w // self.patch_size - boundaries = torch.arange(1 / self.num_patches_per_side, 1.0, 1 / self.num_patches_per_side) - position_ids = torch.full(size=(batch_size, max_nb_patches_h * max_nb_patches_w), fill_value=0) + boundaries = torch.arange( + 1 / self.num_patches_per_side, 1.0, 1 / self.num_patches_per_side, device=pixel_values.device + ) + position_ids = torch.full( + size=(batch_size, max_nb_patches_h * max_nb_patches_w), fill_value=0, device=pixel_values.device + ) for batch_idx, p_attn_mask in enumerate(patch_attention_mask): - nb_patches_h = p_attn_mask[:, 0].sum().item() - nb_patches_w = p_attn_mask[0].sum().item() + nb_patches_h = p_attn_mask[:, 0].sum() + nb_patches_w = p_attn_mask[0].sum() - h_indices = torch.arange(nb_patches_h, dtype=pixel_values.dtype) - w_indices = torch.arange(nb_patches_w, dtype=pixel_values.dtype) + h_indices = torch.arange(nb_patches_h, device=pixel_values.device, dtype=pixel_values.dtype) + w_indices = torch.arange(nb_patches_w, device=pixel_values.device, dtype=pixel_values.dtype) fractional_coords_h = h_indices / nb_patches_h * (1 - 1e-6) fractional_coords_w = w_indices / nb_patches_w * (1 - 1e-6) @@ -152,9 +156,8 @@ def forward(self, pixel_values: torch.FloatTensor, patch_attention_mask: torch.B bucket_coords_w = torch.bucketize(fractional_coords_w, boundaries, right=True) pos_ids = (bucket_coords_h[:, None] * self.num_patches_per_side + bucket_coords_w).flatten() - position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids + position_ids[batch_idx][p_attn_mask.view(-1)] = pos_ids - position_ids = position_ids.to(self.position_embedding.weight.device) embeddings = embeddings + self.position_embedding(position_ids) return embeddings