Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 9 additions & 13 deletions python/sglang/srt/eplb/expert_location.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,17 +284,9 @@ def update(
# -------------------------------- usage ------------------------------------

def logical_to_all_physical(
self,
layer_id: int,
logical_expert_id: int,
require_global_experts: bool = False,
self, layer_id: int, logical_expert_id: int
) -> List[int]:
# Use CPU copy to avoid GPU→CPU sync on every call, which is expensive in update weights scenario
if require_global_experts:
num_physical_experts = self.logical_to_all_physical_map_cpu[layer_id].shape[
-1
]
return list(torch.arange(0, num_physical_experts))
return [
physical_expert_id
for physical_expert_id in self.logical_to_all_physical_map_cpu[
Expand Down Expand Up @@ -363,10 +355,14 @@ def _compute_logical_to_all_physical_map(
)

# Replace by the nearest physical expert
if nearest_expert != -1:
logical_to_all_physical_map[layer_id][logical_expert_id] = [
nearest_expert
]
mapped_physical_experts = logical_to_all_physical_map[layer_id][
logical_expert_id
]
if (
nearest_expert != -1
and nearest_expert not in mapped_physical_experts
):
mapped_physical_experts[0] = nearest_expert
Comment on lines +358 to +365
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

critical

The condition `nearest_expert not in mapped_physical_experts` will always be `False` when `nearest_expert != -1`. This is because `_find_nearest_expert` is called with `mapped_physical_experts` as `candidate_physical_expert_ids`, and it returns an element from that list (or `-1`). As a result, this `if` block is effectively dead code, and the nearest expert is never prioritized.

This defeats the purpose of finding the nearest expert for locality optimization.

To correctly prioritize the nearest expert, it should be moved to the front of the `mapped_physical_experts` list. A swap with the first element is a safe way to do this while preserving the list length and avoiding duplicates.

Suggested change
mapped_physical_experts = logical_to_all_physical_map[layer_id][
logical_expert_id
]
if (
nearest_expert != -1
and nearest_expert not in mapped_physical_experts
):
mapped_physical_experts[0] = nearest_expert
mapped_physical_experts = logical_to_all_physical_map[layer_id][
logical_expert_id
]
if nearest_expert != -1:
# The `_find_nearest_expert` function returns an expert from `mapped_physical_experts`.
# We should move it to the front to prioritize it for dispatch.
try:
current_idx = mapped_physical_experts.index(nearest_expert)
if current_idx > 0:
# Swap with the first element to preserve list length and avoid duplicates.
mapped_physical_experts[0], mapped_physical_experts[current_idx] = (
mapped_physical_experts[current_idx],
mapped_physical_experts[0],
)
except ValueError:
# This path should ideally not be taken.
# It implies `_find_nearest_expert` returned an expert not in the candidate list.
# As a fallback, we replace the first expert.
mapped_physical_experts[0] = nearest_expert


logical_to_all_physical_map = _pad_nested_array(
logical_to_all_physical_map, pad_value=-1
Expand Down
6 changes: 2 additions & 4 deletions python/sglang/srt/layers/moe/fused_moe_triton/layer.py
Original file line number Diff line number Diff line change
Expand Up @@ -539,12 +539,9 @@ def weight_loader(
# This is a shared expert.
physical_expert_ids = [expert_id]
else:
require_global_experts = getattr(
param, "_sglang_require_global_experts", False
)
physical_expert_ids = (
global_expert_location_metadata.logical_to_all_physical(
self.layer_id, expert_id, require_global_experts
self.layer_id, expert_id
)
)

Expand Down Expand Up @@ -1129,6 +1126,7 @@ def forward(self, hidden_states: torch.Tensor, topk_output: TopKOutput):
local_expert_offset=self.moe_ep_rank * self.num_local_experts,
local_num_experts=self.num_local_experts,
routed_scaling_factor=self.moe_runner_config.routed_scaling_factor,
tile_tokens_dim=None,
routing_method_type=RoutingMethodType.DeepSeekV3,
do_finalize=True,
output=symm_output,
Expand Down
1 change: 1 addition & 0 deletions python/sglang/srt/layers/quantization/fp8.py
Original file line number Diff line number Diff line change
Expand Up @@ -1245,6 +1245,7 @@ def apply_with_router_logits(
routed_scaling_factor=(
routed_scaling_factor if routed_scaling_factor is not None else 1.0
),
tile_tokens_dim=None,
routing_method_type=routing_method_type,
use_shuffled_weight=False,
)
Expand Down
1 change: 1 addition & 0 deletions python/sglang/srt/layers/quantization/modelopt_quant.py
Original file line number Diff line number Diff line change
Expand Up @@ -695,6 +695,7 @@ def apply(
else 1.0
),
use_routing_scales_on_input=use_routing_scales_on_input,
tile_tokens_dim=None,
routing_method_type=routing_method_type,
)

Expand Down
1 change: 1 addition & 0 deletions python/sglang/srt/layers/quantization/mxfp4.py
Original file line number Diff line number Diff line change
Expand Up @@ -681,6 +681,7 @@ def apply(
layer.moe_ep_rank * layer.num_local_experts, # local_expert_offset
layer.num_local_experts, # local num experts
None,
None, # tile_tokens_dim
1, # routing_method_type, renormalize
True, # do finalize
output=symm_output,
Expand Down
Loading