sgl-project · zhyncs · Jun 9, 2025 · Jun 8, 2025 · gemini-code-assist · Jun 8, 2025
diff --git a/python/sglang/srt/models/deepseek_v2.py b/python/sglang/srt/models/deepseek_v2.py
@@ -420,10 +420,13 @@ def forward_deepep(
                 topk_weights=topk_weights,
                 forward_mode=forward_mode,
             )
-        final_hidden_states *= self.routed_scaling_factor
 
         if shared_output is not None:
-            final_hidden_states = final_hidden_states + shared_output
+            x = shared_output
+            x.add_(final_hidden_states, alpha=self.routed_scaling_factor)
+            final_hidden_states = x
-            x = shared_output
-            x.add_(final_hidden_states, alpha=self.routed_scaling_factor)
-            final_hidden_states = x
+            # Accumulate the scaled routed output into shared_output in place, then reuse it as final_hidden_states.
+            # This is equivalent to:
+            # final_hidden_states = shared_output_original + final_hidden_states_unscaled * self.routed_scaling_factor
+            # but avoids allocating a separate tensor for the sum. shared_output is overwritten, so this is only safe while it is an intermediate tensor that nothing reads afterwards.
+            shared_output.add_(final_hidden_states, alpha=self.routed_scaling_factor)
+            final_hidden_states = shared_output
-            x = shared_output
-            x.add_(final_hidden_states, alpha=self.routed_scaling_factor)
-            final_hidden_states = x
+            # Accumulate the scaled routed output into shared_output in place, then reuse it as final_hidden_states.
+            # This is equivalent to:
+            # final_hidden_states = shared_output_original + final_hidden_states_unscaled * self.routed_scaling_factor
+            # but avoids allocating a separate tensor for the sum. shared_output is overwritten, so this is only safe while it is an intermediate tensor that nothing reads afterwards.
+            shared_output.add_(final_hidden_states, alpha=self.routed_scaling_factor)
+            final_hidden_states = shared_output
+        else:
+            final_hidden_states *= self.routed_scaling_factor
 
         return final_hidden_states