Refine deepseekv2 modeling for to_static #9851

Open · wants to merge 25 commits into develop
Commits (25)
2f9956a  refine log (zhangbo9674, Nov 8, 2024)
ee5f151  Merge branch 'develop' of https://github.com/PaddlePaddle/PaddleNLP i… (zhangbo9674, Nov 29, 2024)
377962a  Merge branch 'develop' of https://github.com/PaddlePaddle/PaddleNLP i… (zhangbo9674, Nov 29, 2024)
3aa70a8  Merge branch 'develop' of https://github.com/PaddlePaddle/PaddleNLP i… (zhangbo9674, Dec 6, 2024)
3caaac7  Merge branch 'develop' of https://github.com/PaddlePaddle/PaddleNLP i… (zhangbo9674, Feb 11, 2025)
796bff7  fix (zhangbo9674, Feb 12, 2025)
87c33ac  add model args (zhangbo9674, Feb 12, 2025)
29b05b0  suppoort_deepseekv2_autoparallel_with_DP/MP (xuxinyi389, Feb 13, 2025)
20be84b  poolish (xuxinyi389, Feb 13, 2025)
31ec76c  remove_env_set (xuxinyi389, Feb 13, 2025)
34009e7  update_code (xuxinyi389, Feb 13, 2025)
e553a3a  add_v3 (xuxinyi389, Feb 14, 2025)
0c96a56  support_sharding (xuxinyi389, Feb 14, 2025)
0aa9fb0  move_to_v3 (xuxinyi389, Feb 14, 2025)
3e84fc6  fix_typo (xuxinyi389, Feb 14, 2025)
495d123  update_v3_config (xuxinyi389, Feb 14, 2025)
14470cf  Merge commit 'refs/pull/9862/head' of https://github.com/PaddlePaddle… (zhangbo9674, Feb 17, 2025)
5835f1e  refine (zhangbo9674, Feb 17, 2025)
909ffe8  refine (zhangbo9674, Feb 17, 2025)
93cfe79  refine (zhangbo9674, Feb 17, 2025)
c48d6c2  fix (zhangbo9674, Feb 18, 2025)
7f7a486  fix (zhangbo9674, Feb 19, 2025)
428bd09  fix (zhangbo9674, Feb 19, 2025)
41b9107  fix (zhangbo9674, Feb 24, 2025)
4b83dee  fix (zhangbo9674, Feb 24, 2025)
14 changes: 3 additions & 11 deletions in paddlenlp/transformers/deepseek_v2/modeling.py

@@ -231,11 +231,8 @@
         )

         attn_weights = attn_weights + attention_mask
-        if not paddle.in_dynamic_mode():
+        with paddle.amp.auto_cast(False):
             attn_weights = F.softmax(attn_weights, axis=-1, dtype="float32").astype(query_states.dtype)
-        else:
-            with paddle.amp.auto_cast(False):
-                attn_weights = F.softmax(attn_weights, axis=-1, dtype="float32").astype(query_states.dtype)

         attn_weights = F.dropout(attn_weights, p=config.attention_dropout, training=training)

Codecov: added line #L234 was not covered by tests.
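Why this matters for to_static: in the old code, the `paddle.in_dynamic_mode()` branch skipped `paddle.amp.auto_cast(False)` when the model ran as a static graph, so the softmax could execute in reduced precision after conversion. Unifying the branches keeps the float32 softmax on a single code path in both modes. A minimal sketch of the resulting pattern (the toy layer and shapes are assumptions, not code from this PR):

import paddle
import paddle.nn.functional as F

class ToyAttnSoftmax(paddle.nn.Layer):
    """Hypothetical layer illustrating the unified softmax path."""

    def forward(self, attn_scores):
        # Softmax runs in float32 with amp disabled, then casts back, so
        # bf16/fp16 runs keep full-precision normalization in both modes.
        with paddle.amp.auto_cast(False):
            return F.softmax(attn_scores, axis=-1, dtype="float32").astype(attn_scores.dtype)

layer = ToyAttnSoftmax()
static_layer = paddle.jit.to_static(layer)  # traces one branch-free graph
out = static_layer(paddle.randn([2, 8, 16], dtype="float32"))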
@@ -335,12 +332,7 @@
                 f"Implementation of fused_rms_norm is not available on {get_env_device()}. Please install paddle_xpu to use this feature"
             )

-        if paddle.in_dynamic_mode():
-            with paddle.amp.auto_cast(False):
-                hidden_states = hidden_states.astype("float32")
-                variance = hidden_states.pow(2).mean(-1, keepdim=True)
-                hidden_states = paddle.rsqrt(variance + self.variance_epsilon) * hidden_states
-        else:
+        with paddle.amp.auto_cast(False):
             hidden_states = hidden_states.astype("float32")
             variance = hidden_states.pow(2).mean(-1, keepdim=True)
             hidden_states = paddle.rsqrt(variance + self.variance_epsilon) * hidden_states

Codecov: added line #L335 was not covered by tests.
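The RMSNorm fallback gets the same treatment: `paddle.amp.auto_cast(False)` now wraps the float32 statistics in static mode too, not just in dynamic mode. A self-contained sketch of the computation (the standalone helper, its signature, and the final weight multiply are illustrative assumptions; the actual module in modeling.py may differ):

import paddle

def rms_norm_fp32(hidden_states, weight, variance_epsilon=1e-6):
    # Statistics are taken in float32 with amp disabled, so low-precision
    # (bf16/fp16) training does not degrade the variance estimate.
    orig_dtype = hidden_states.dtype
    with paddle.amp.auto_cast(False):
        hidden_states = hidden_states.astype("float32")
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = paddle.rsqrt(variance + variance_epsilon) * hidden_states
    return (hidden_states * weight.astype("float32")).astype(orig_dtype)

x = paddle.randn([2, 16, 64], dtype="float32")
y = rms_norm_fp32(x, paddle.ones([64]))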
@@ -543,7 +535,7 @@

         t = paddle.arange(seq_len, dtype=paddle.float32)

-        freqs = paddle.outer(t, self.inv_freq)
+        freqs = paddle.outer(t, paddle.cast(self.inv_freq, dtype="float32"))

         _mscale = float(
             yarn_get_mscale(self.scaling_factor, self.mscale)

Codecov: added line #L538 was not covered by tests.
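The `paddle.cast` guard ensures the RoPE frequency table is built in float32 even if `self.inv_freq` ends up in a lower precision elsewhere; `paddle.outer` would otherwise follow the buffer's dtype. A standalone sketch with illustrative sizes (`dim`, `base`, and the concat layout are assumptions, not the model's configuration):

import paddle

dim, base, seq_len = 64, 10000.0, 16  # illustrative values only
inv_freq = 1.0 / (base ** (paddle.arange(0, dim, 2, dtype="float32") / dim))

t = paddle.arange(seq_len, dtype=paddle.float32)
# The explicit cast mirrors the fix: the outer product stays float32
# even if inv_freq were converted to bf16/fp16 somewhere along the way.
freqs = paddle.outer(t, paddle.cast(inv_freq, dtype="float32"))
emb = paddle.concat([freqs, freqs], axis=-1)  # one common RoPE layout
cos, sin = emb.cos(), emb.sin()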
6 changes: 5 additions & 1 deletion in paddlenlp/transformers/deepseek_v2/modeling_auto.py

@@ -705,7 +705,11 @@
             inputs_embeds = self.embed_tokens(input_ids)

         # embed positions
-        if attn_mask_startend_row_indices is not None or get_use_casual_mask():
+        if (
+            attn_mask_startend_row_indices is not None
+            or get_use_casual_mask()
+            or (self.config.use_flash_attention and self.training)
+        ):
             attention_mask = None
         else:
             # [bs, seq_len]

Codecov: added line #L708 was not covered by tests.
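The added clause means that when flash attention is enabled during training, the model no longer materializes a dense attention mask: the fused kernel handles causal masking internally, so `attention_mask` can stay `None`. A sketch of the decision as a standalone predicate (the function and its arguments are stand-ins for the model's attributes, not an API from this PR):

def needs_dense_attention_mask(
    attn_mask_startend_row_indices,
    use_casual_mask: bool,
    use_flash_attention: bool,
    training: bool,
) -> bool:
    # The dense [bs, 1, seq, seq] mask is built only when no cheaper
    # masking path can take over: start/end row indices, the causal-mask
    # flag, or the flash-attention kernel during training.
    return not (
        attn_mask_startend_row_indices is not None
        or use_casual_mask
        or (use_flash_attention and training)
    )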