huggingface · younesbelkada · Dec 9, 2022 · Dec 9, 2022 · Dec 9, 2022 · Dec 9, 2022
diff --git a/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py b/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py
@@ -358,7 +358,6 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to
         return hidden_states
 
 
-# Copied from transformers.models.vit.modeling_vit.ViTLayer with ViT->ViTHybrid
 class ViTHybridLayer(nn.Module):
     """This corresponds to the Block class in the timm implementation."""
 
@@ -387,7 +386,8 @@ def forward(
         outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights
 
         # first residual connection
-        hidden_states = attention_output + hidden_states
+        # We assign to correct device for `accelerate`, check: https://github.com/huggingface/transformers/pull/20705/
+        hidden_states = attention_output + hidden_states.to(attention_output.device)
 
         # in ViTHybrid, layernorm is also applied after self-attention
         layer_output = self.layernorm_after(hidden_states)