diff --git a/paddleformers/transformers/glm4_moe/modeling.py b/paddleformers/transformers/glm4_moe/modeling.py
index aa18e636a92..de40038ac6a 100644
--- a/paddleformers/transformers/glm4_moe/modeling.py
+++ b/paddleformers/transformers/glm4_moe/modeling.py
@@ -304,6 +304,9 @@ def __init__(self, config, num_experts, expert_hidden_size, **kwargs):
         )
         self.expert_usage.stop_gradient = True
 
+        # weight and e_score_correction_bias do not need to be cast to low precision
+        self._cast_to_low_precision = False
+
     def forward(self, hidden_states):
         """
         Args:
@@ -340,12 +343,15 @@ def __init__(self, config: Glm4MoeConfig):
 
         self.weight = paddle.create_parameter(
             shape=[self.n_routed_experts, config.hidden_size],
-            dtype="bfloat16",
+            dtype="float32",
             default_initializer=paddle.nn.initializer.Uniform(),
         )
         self.register_buffer("e_score_correction_bias", paddle.zeros((self.n_routed_experts,), dtype=paddle.float32))
 
+        # weight and e_score_correction_bias do not need to be cast to low precision
+        self._cast_to_low_precision = False
+
     @paddle.no_grad()
     def get_topk_indices(self, scores):
         scores_for_choice = scores.reshape([-1, self.n_routed_experts]) + self.e_score_correction_bias.unsqueeze(0)
@@ -588,6 +594,7 @@ class Glm4MoePreTrainedModel(PretrainedModel):
     config: Glm4MoeConfig
     config_class = Glm4MoeConfig
     base_model_prefix = "model"
+    _keep_in_fp32_modules = ["mlp.gate.weight", "e_score_correction_bias"]
     transpose_weight_keys = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
 
     @classmethod
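
For context, the sketch below illustrates how a `_keep_in_fp32_modules`-style name filter is typically applied when a checkpoint is cast to low precision: floating-point tensors whose names match one of the kept keys stay in float32, everything else is cast down. This is a minimal illustration only, not the actual PaddleFormers loading code; the helper `convert_state_dict` and the constant `FLOAT_DTYPES` are hypothetical names introduced here for the example.

```python
import paddle

# hypothetical helper, not part of PaddleFormers
FLOAT_DTYPES = (paddle.float16, paddle.bfloat16, paddle.float32, paddle.float64)

def convert_state_dict(state_dict, keep_in_fp32_keys, dtype="bfloat16"):
    """Cast floating-point tensors to `dtype`, except names matching a kept key."""
    out = {}
    for name, tensor in state_dict.items():
        if tensor.dtype not in FLOAT_DTYPES:
            out[name] = tensor                    # leave integer/bool tensors untouched
        elif any(key in name for key in keep_in_fp32_keys):
            out[name] = tensor.astype("float32")  # e.g. "mlp.gate.weight", "e_score_correction_bias"
        else:
            out[name] = tensor.astype(dtype)
    return out

# usage with the keys added in this diff:
# sd = convert_state_dict(model.state_dict(), ["mlp.gate.weight", "e_score_correction_bias"])
```

The `_cast_to_low_precision = False` flag set on the gate layers serves the same purpose from the layer side: routing weights and the correction bias are numerically sensitive, so they are kept in float32 even when the rest of the model runs in bfloat16.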