sgl-project · merrymercy · Apr 26, 2025 · Apr 24, 2025
diff --git a/python/sglang/srt/models/deepseek_nextn.py b/python/sglang/srt/models/deepseek_nextn.py
@@ -215,11 +215,11 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
                     "up_proj.weight_scale_inv",
                 ]
             names_to_remove = []
-            for num_repeat in range(self.n_share_experts_fusion):
-                for suffix in suffix_list:
-                    shared_expert_weight_name = (
-                        f"model.layers.0.mlp.shared_experts.{suffix}"
-                    )
+            for suffix in suffix_list:
+                shared_expert_weight_name = (
+                    f"model.layers.0.mlp.shared_experts.{suffix}"
+                )
+                for num_repeat in range(self.n_share_experts_fusion):
                     weights_list.append(
                         (
                             f"model.layers.0."
@@ -229,7 +229,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
                             weights_dict[shared_expert_weight_name],
                         )
                     )
-                    names_to_remove += [shared_expert_weight_name]
+                names_to_remove += [shared_expert_weight_name]
             weights = [w for w in weights_list if w[0] not in names_to_remove]
 
         # Params for weights, fp8 weight scales, fp8 activation scales

diff --git a/python/sglang/srt/models/deepseek_v2.py b/python/sglang/srt/models/deepseek_v2.py
@@ -1650,11 +1650,11 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
                 desc=f"Cloning {self.n_share_experts_fusion} "
                 "replicas of the shared expert into MoE",
             ):
-                for num_repeat in range(self.n_share_experts_fusion):
-                    for suffix in suffix_list:
-                        shared_expert_weight_name = (
-                            f"model.layers.{moe_layer}.mlp.shared_experts.{suffix}"
-                        )
+                for suffix in suffix_list:
+                    shared_expert_weight_name = (
+                        f"model.layers.{moe_layer}.mlp.shared_experts.{suffix}"
+                    )
+                    for num_repeat in range(self.n_share_experts_fusion):
                         weights_list.append(
                             (
                                 f"model.layers.{moe_layer}."
@@ -1664,7 +1664,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
                                 weights_dict[shared_expert_weight_name],
                             )
                         )
-                        names_to_remove += [shared_expert_weight_name]
+                    names_to_remove += [shared_expert_weight_name]
             weights = [w for w in weights_list if w[0] not in names_to_remove]
 
         # Params for weights, fp8 weight scales, fp8 activation scales