Merged

Commits (20)
44a270c  Fix DeepSeekV3ForCausalLM.post_load_weights for multiple update weight (zhuzilin, May 13, 2025)
3b6c997  use set instead of list (zhuzilin, May 14, 2025)
76ee056  Merge branch 'main' into feature/fix_deepseek_v2_loader (zhaochenyang20, May 17, 2025)
15751b6  Merge branch 'main' into feature/fix_deepseek_v2_loader (zhaochenyang20, May 20, 2025)
1d1bf24  Merge branch 'main' into feature/fix_deepseek_v2_loader (zhaochenyang20, May 20, 2025)
8a36ff6  Merge branch 'main' into feature/fix_deepseek_v2_loader (zhyncs, May 20, 2025)
af7a806  Merge branch 'main' into feature/fix_deepseek_v2_loader (zhyncs, May 21, 2025)
d6b2c62  Merge branch 'main' into feature/fix_deepseek_v2_loader (zhaochenyang20, May 21, 2025)
f3fa6de  add util function (zhuzilin, May 23, 2025)
7b6f462  fix bug with nextn layer (zhuzilin, May 23, 2025)
d8806a6  Merge branch 'main' into feature/fix_deepseek_v2_loader (zhaochenyang20, May 23, 2025)
32f2e8d  bugfix (zhuzilin, May 23, 2025)
1a26b13  Merge branch 'main' into feature/fix_deepseek_v2_loader (zhyncs, May 23, 2025)
00c9722  Merge branch 'main' into feature/fix_deepseek_v2_loader (zhyncs, May 23, 2025)
fa8236f  bugfix (zhuzilin, May 25, 2025)
9499f94  Merge branch 'main' into feature/fix_deepseek_v2_loader (zhaochenyang20, May 25, 2025)
194b671  Merge branch 'main' into feature/fix_deepseek_v2_loader (zhyncs, May 26, 2025)
d8c047f  bugfix (zhuzilin, May 27, 2025)
1d68017  Merge branch 'main' into feature/fix_deepseek_v2_loader (zhuzilin, May 27, 2025)
3bed57a  Merge branch 'main' into feature/fix_deepseek_v2_loader (zhaochenyang20, May 29, 2025)
55 changes: 41 additions & 14 deletions python/sglang/srt/models/deepseek_v2.py
@@ -1533,14 +1533,23 @@ def forward(
             input_ids, hidden_states, self.lm_head, forward_batch
         )

-    def post_load_weights(self, is_nextn=False):
+    def post_load_weights(self, is_nextn=False, weight_names=None):

         # Perform post-processing after loading weights
-        layer_ids = (
-            range(self.config.num_hidden_layers)
-            if not is_nextn
-            else [self.config.num_hidden_layers]
-        )
+        if weight_names is None:
+            layer_ids = (
+                range(self.config.num_hidden_layers)
+                if not is_nextn
+                else [self.config.num_hidden_layers]
+            )
+        else:
+            layer_ids = []
Edenzzzz (Contributor) commented on May 13, 2025:
layer_ids = set()?

zhuzilin (Collaborator, Author) replied:
fixed.
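For reference, the follow-up commit "use set instead of list" presumably lands on something like the sketch below; the variable names come from the diff above, and the example weight name is only an illustration, not taken from the PR:

    # Assumed shape of the set-based fix (not the exact committed code).
    layer_ids = set()
    for name in weight_names:
        if "kv_b_proj" in name:
            # e.g. "model.layers.3.self_attn.kv_b_proj.weight" -> layer id 3
            layer_ids.add(int(name.split(".")[2]))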

+            for name in weight_names:
+                if "kv_b_proj" in name:
+                    layer_id = int(name.split(".")[2])
+                    if layer_id not in layer_ids:
+                        layer_ids.append(layer_id)

         for layer_id in layer_ids:
             self_attn = (
                 self.model.layers[layer_id].self_attn
@@ -1640,13 +1649,22 @@ def post_load_weights(self, is_nextn=False):
                 0, (-1, self_attn.qk_nope_head_dim + self_attn.v_head_dim)
             ).split([self_attn.qk_nope_head_dim, self_attn.v_head_dim], dim=1)
             if not use_deep_gemm_bmm:
-                self_attn.w_kc = w_kc.transpose(1, 2).contiguous().transpose(1, 2)
-                self_attn.w_vc = w_vc.contiguous().transpose(1, 2)
+                if self_attn.w_kc is None:
+                    self_attn.w_kc = w_kc.transpose(1, 2).contiguous().transpose(1, 2)
+                    self_attn.w_vc = w_vc.contiguous().transpose(1, 2)
+                else:
+                    self_attn.w_kc.copy_(
+                        w_kc.transpose(1, 2).contiguous().transpose(1, 2)
+                    )
+                    self_attn.w_vc.copy_(w_vc.contiguous().transpose(1, 2))
Contributor:
By corrupt do you mean the weight tensor is incorrect or the output?

zhuzilin (Author):
The output will be incorrect.

Contributor:
I was wondering if the weight is not assigned or deepgemm is not using the new weights. Perhaps check the w_kc.data_ptr() here if you want.

zhuzilin (Author):
I kind of suspect it's an issue with cuda graph, as cuda graph will remember the pointer of the tensors.

Contributor:
agreed
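To make the suspicion above concrete (purely illustrative Python, not sglang code): a captured CUDA graph replays against the tensor storage it recorded at capture time, so an in-place copy_ keeps feeding that storage, while rebinding the attribute allocates new storage the graph never sees. Here captured_ptr is a hypothetical stand-in for the pointer a captured graph would hold:

    import torch

    w_kc = torch.zeros(2, 2)
    captured_ptr = w_kc.data_ptr()  # what a captured graph would keep referring to

    w_kc.copy_(torch.ones(2, 2))    # in-place update: same storage, new values visible
    assert w_kc.data_ptr() == captured_ptr

    w_kc = torch.ones(2, 2)         # rebinding: new storage; the old pointer now serves stale data
    assert w_kc.data_ptr() != captured_ptr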

                 if (
                     hasattr(self_attn.kv_b_proj, "weight_scale")
                     and self_attn.w_scale is None
                 ):
-                    self_attn.w_scale = self_attn.kv_b_proj.weight_scale
+                    if self_attn.w_scale is None:
+                        self_attn.w_scale = self_attn.kv_b_proj.weight_scale
+                    else:
+                        self_attn.w_scale.copy_(self_attn.kv_b_proj.weight_scale)
Collaborator:
nit: handle these tensors with a new method in srt/utils.py:

    def bind_or_assign(target, source):
        if target is not None:
            target.copy_(source)
            return target
        else:
            return source

zhuzilin (Author):
fixed.
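Presumably the "add util function" commit applies the helper at the call sites above roughly as follows; this is a sketch of the suggestion, not the exact committed code:

    self_attn.w_kc = bind_or_assign(
        self_attn.w_kc, w_kc.transpose(1, 2).contiguous().transpose(1, 2)
    )
    self_attn.w_vc = bind_or_assign(self_attn.w_vc, w_vc.contiguous().transpose(1, 2))
    self_attn.w_scale = bind_or_assign(
        self_attn.w_scale, self_attn.kv_b_proj.weight_scale
    )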

                     if _is_hip:
                         self_attn.w_scale *= 2.0
             else:
@@ -1655,10 +1673,16 @@
                 ws_kc, ws_vc = block_scale.unflatten(
                     0, (-1, (num_tiles_k + num_tiles_n))
                 ).split([num_tiles_k, num_tiles_n], dim=1)
-                self_attn.w_scale_k = ws_kc.transpose(1, 2).contiguous()
-                self_attn.w_scale_v = ws_vc.contiguous()
-                self_attn.w_kc = w_kc.transpose(1, 2).contiguous()
-                self_attn.w_vc = w_vc.contiguous()
+                if self_attn.w_kc is None:
+                    self_attn.w_scale_k = ws_kc.transpose(1, 2).contiguous()
+                    self_attn.w_scale_v = ws_vc.contiguous()
+                    self_attn.w_kc = w_kc.transpose(1, 2).contiguous()
+                    self_attn.w_vc = w_vc.contiguous()
+                else:
+                    self_attn.w_scale_k.copy_(ws_kc.transpose(1, 2).contiguous())
+                    self_attn.w_scale_v.copy_(ws_vc.contiguous())
+                    self_attn.w_kc.copy_(w_kc.transpose(1, 2).contiguous())
+                    self_attn.w_vc.copy_(w_vc.contiguous())
                 self_attn.use_deep_gemm_bmm = True

     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]], is_nextn=False):
@@ -1765,7 +1789,10 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]], is_nextn=False):
         ]

         params_dict = dict(self.named_parameters())
+        weight_names = []
         for name, loaded_weight in weights:
+            weight_names.append(name)
+
             if not is_nextn:
                 if hasattr(self.config, "num_nextn_predict_layers"):
                     num_nextn_layers = self.config.num_nextn_predict_layers
@@ -1883,7 +1910,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]], is_nextn=False):
                     )
                     weight_loader(param, loaded_weight)

-        self.post_load_weights(is_nextn=is_nextn)
+        self.post_load_weights(is_nextn=is_nextn, weight_names=weight_names)

     def get_embed_and_head(self):
         return self.model.embed_tokens.weight, self.lm_head.weight