4 changes: 2 additions & 2 deletions python/sglang/srt/layers/moe/fused_moe_triton/layer.py
@@ -283,7 +283,7 @@ def _load_w13(
)

         expert_data = expert_data.narrow(shard_dim, start, shard_size)
-        expert_data.copy_(loaded_weight)
+        expert_data.copy_(loaded_weight, non_blocking=True)
Contributor


critical

Using non_blocking=True is a good optimization for overlapping host-to-device transfers. However, it makes the copy asynchronous: the call returns before the data has landed, so explicit synchronization is required before the weights can be safely used.

torch.cuda.synchronize() has been added to qwen3_moe.py, which is correct. But since this change is in a shared layer file, it affects every model that uses FusedMoE. For example, qwen2_moe.py also uses this layer, yet its load_weights method has no synchronization call.

Without that barrier, weights can be read before they are fully copied to the device — a race condition. Please ensure that all models using FusedMoE are updated to synchronize after weight loading.
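A minimal, hedged sketch of the pattern under review (not the sglang code itself): copies are enqueued with non_blocking=True, and a single device barrier is issued once loading finishes. On a CPU-only host the non-blocking flag is effectively ignored and the guard skips the sync, so the sketch runs anywhere; true async transfer additionally needs a pinned-memory source.

```python
import torch

def load_expert_weight(expert_data: torch.Tensor, loaded_weight: torch.Tensor) -> None:
    # Enqueue the copy without blocking the host thread. For a real
    # async H2D transfer the source should live in pinned memory.
    expert_data.copy_(loaded_weight, non_blocking=True)

def finish_loading() -> None:
    # Called once after all copies are enqueued; the barrier ensures
    # every pending transfer has landed before the weights are read.
    if torch.cuda.is_available():
        torch.cuda.synchronize()

dst = torch.zeros(4)
src = torch.arange(4, dtype=torch.float32)
load_expert_weight(dst, src)
finish_loading()
print(torch.equal(dst, src))
```

The key point is that the barrier is placed once, after the whole loading loop, rather than after each copy — otherwise the overlap the optimization buys is lost.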


def _load_w2(
self,
@@ -347,7 +347,7 @@ def _load_w2(
)

         # w2, down_proj: Load into only logical weight of w2.
-        expert_data.copy_(loaded_weight)
+        expert_data.copy_(loaded_weight, non_blocking=True)


def _load_single_value(
self, param: torch.nn.Parameter, loaded_weight: torch.Tensor, expert_id: int
3 changes: 3 additions & 0 deletions python/sglang/srt/models/qwen3_moe.py
@@ -836,6 +836,9 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
             else:
                 logger.warning(f"Parameter {name} not found in params_dict")

+        # Synchronize to ensure all weights are loaded since we loaded them in non-blocking mode
+        torch.cuda.synchronize()
Contributor


high

This synchronization call is necessary for the non-blocking copies to complete before the weights are used. However, it will raise an error when run in a CPU-only environment where CUDA is not available.

To prevent this, guard the call with a CUDA-availability check. The _is_cuda variable is already defined in this file for exactly this purpose.

Suggested change
-        torch.cuda.synchronize()
+        if _is_cuda:
+            torch.cuda.synchronize()
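As a self-contained sketch of the suggested guard (the module-level flag here is illustrative; sglang's _is_cuda may be computed differently): torch.cuda.synchronize() raises on CUDA-less builds, so the helper becomes a no-op there and a device barrier otherwise.

```python
import torch

# Illustrative stand-in for the _is_cuda flag mentioned in the review.
_is_cuda = torch.cuda.is_available()

def sync_after_weight_load() -> None:
    # No-op on CPU-only hosts, where torch.cuda.synchronize() would
    # raise; a full device barrier when CUDA is present.
    if _is_cuda:
        torch.cuda.synchronize()

sync_after_weight_load()  # safe on both CPU-only and GPU hosts
```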

Collaborator


We need to add the synchronize call to all MoE models, e.g. DeepSeekV3 and GLM4MoE.
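One hedged way to avoid duplicating the barrier across DeepSeekV3, GLM4MoE, Qwen2/3-MoE, and the rest is a shared helper that each model's load_weights calls as its final step. All names below are illustrative, not sglang's actual API:

```python
import torch

def finalize_weight_loading() -> None:
    # One barrier covers every non_blocking copy issued while loading.
    if torch.cuda.is_available():
        torch.cuda.synchronize()

class ToyMoEModel:
    """Illustrative model; params_dict stands in for named parameters."""

    def __init__(self):
        self.params_dict = {"w2": torch.zeros(2)}

    def load_weights(self, weights):
        for name, loaded_weight in weights:
            # Mirrors the shared-layer change under review: the copy
            # is enqueued without waiting for it to finish.
            self.params_dict[name].copy_(loaded_weight, non_blocking=True)
        # Required final step for every model that uses FusedMoE.
        finalize_weight_loading()

model = ToyMoEModel()
model.load_weights([("w2", torch.ones(2))])
print(torch.equal(model.params_dict["w2"], torch.ones(2)))
```

Centralizing the barrier this way means a newly added MoE model cannot forget it, which addresses the race the earlier comment describes.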


         # TODO mimic deepseek
         self.routed_experts_weights_of_layer = {
             layer_id: self.model.layers[layer_id].mlp.get_moe_weights()