diff --git a/python/sglang/srt/models/deepseek_v2.py b/python/sglang/srt/models/deepseek_v2.py index 83211e8ebd8..4fc158fba6f 100644 --- a/python/sglang/srt/models/deepseek_v2.py +++ b/python/sglang/srt/models/deepseek_v2.py @@ -420,10 +420,13 @@ def forward_deepep( topk_weights=topk_weights, forward_mode=forward_mode, ) - final_hidden_states *= self.routed_scaling_factor if shared_output is not None: - final_hidden_states = final_hidden_states + shared_output + x = shared_output + x.add_(final_hidden_states, alpha=self.routed_scaling_factor) + final_hidden_states = x + else: + final_hidden_states *= self.routed_scaling_factor return final_hidden_states