huggingface · regisss · Nov 26, 2024 · Sep 20, 2024
@@ -73,8 +73,7 @@ def gaudi_BertModel_forward(
 
     # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
     # ourselves in which case we just need to make it broadcastable to all heads.
-    dtype = torch.hpu.get_autocast_hpu_dtype() if torch.hpu.is_autocast_hpu_enabled() else self.dtype
-    extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, dtype=dtype)
+    extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, dtype=self.dtype)
 
     # If a 2D or 3D attention mask is provided for the cross-attention
     # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]