diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/layer.py b/python/sglang/srt/layers/moe/fused_moe_triton/layer.py
index d0a9ed132562..4568273c960f 100644
--- a/python/sglang/srt/layers/moe/fused_moe_triton/layer.py
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/layer.py
@@ -283,7 +283,7 @@ def _load_w13(
         )
         expert_data = expert_data.narrow(shard_dim, start, shard_size)
-        expert_data.copy_(loaded_weight)
+        expert_data.copy_(loaded_weight, non_blocking=True)
 
     def _load_w2(
         self,
@@ -347,7 +347,7 @@ def _load_w2(
         )
         # w2, down_proj: Load into only logical weight of w2.
-        expert_data.copy_(loaded_weight)
+        expert_data.copy_(loaded_weight, non_blocking=True)
 
     def _load_single_value(
         self, param: torch.nn.Parameter, loaded_weight: torch.Tensor, expert_id: int
diff --git a/python/sglang/srt/models/qwen3_moe.py b/python/sglang/srt/models/qwen3_moe.py
index c531dd0b4818..bd271c99a0e5 100644
--- a/python/sglang/srt/models/qwen3_moe.py
+++ b/python/sglang/srt/models/qwen3_moe.py
@@ -836,6 +836,9 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
             else:
                 logger.warning(f"Parameter {name} not found in params_dict")
 
+        # Synchronize to ensure all weights are loaded since we loaded them in non-blocking mode
+        torch.cuda.synchronize()
+
         # TODO mimic deepseek
         self.routed_experts_weights_of_layer = {
             layer_id: self.model.layers[layer_id].mlp.get_moe_weights()
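
Why this works: with non_blocking=True, each copy_() returns as soon as the
transfer is enqueued on the current CUDA stream instead of waiting for it to
complete, so the weight-loading loop is no longer serialized on per-tensor
transfer latency; the single torch.cuda.synchronize() added at the end of
load_weights() is the one barrier that guarantees every queued copy has
finished before the weights are used. A minimal self-contained sketch of the
same pattern follows, assuming a CUDA device is available; the names
gpu_params and cpu_weights are illustrative and do not come from the sglang
codebase. Note that non-blocking host-to-device copies only truly overlap
when the host tensor sits in pinned (page-locked) memory; from ordinary
pageable memory the copy is effectively synchronous.

import torch

device = torch.device("cuda")

# Destination parameters already live on the GPU.
gpu_params = [torch.empty(1024, 1024, device=device) for _ in range(8)]

# Host-side weights. pin_memory() places them in page-locked RAM, which is
# what lets copy_(..., non_blocking=True) run as a true asynchronous DMA
# transfer; without it, non_blocking=True gains nothing.
cpu_weights = [torch.randn(1024, 1024).pin_memory() for _ in range(8)]

# Queue every copy on the current CUDA stream without waiting for the
# previous one to finish; copy_() returns once the transfer is enqueued.
for dst, src in zip(gpu_params, cpu_weights):
    dst.copy_(src, non_blocking=True)

# One barrier at the very end, mirroring the torch.cuda.synchronize() the
# diff adds to load_weights(): no GPU weight may be read (e.g. by a first
# forward pass) until every queued copy has completed.
torch.cuda.synchronize()

Synchronizing once at the end, rather than after each copy, is what turns N
serialized transfers into a pipelined batch; the trade-off is that any error
in a queued copy only surfaces at the barrier.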