Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 9 additions & 13 deletions python/sglang/srt/eplb/expert_location.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,17 +284,9 @@ def update(
# -------------------------------- usage ------------------------------------

def logical_to_all_physical(
self,
layer_id: int,
logical_expert_id: int,
require_global_experts: bool = False,
self, layer_id: int, logical_expert_id: int
) -> List[int]:
# Use CPU copy to avoid GPU→CPU sync on every call, which is expensive in update weights scenario
if require_global_experts:
num_physical_experts = self.logical_to_all_physical_map_cpu[layer_id].shape[
-1
]
return list(torch.arange(0, num_physical_experts))
return [
physical_expert_id
for physical_expert_id in self.logical_to_all_physical_map_cpu[
Expand Down Expand Up @@ -363,10 +355,14 @@ def _compute_logical_to_all_physical_map(
)

# Replace by the nearest physical expert
if nearest_expert != -1:
logical_to_all_physical_map[layer_id][logical_expert_id] = [
nearest_expert
]
mapped_physical_experts = logical_to_all_physical_map[layer_id][
logical_expert_id
]
if (
nearest_expert != -1
and nearest_expert not in mapped_physical_experts
):
mapped_physical_experts[0] = nearest_expert
Comment on lines +358 to +365
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

critical

The condition `nearest_expert not in mapped_physical_experts` will always be `False` when `nearest_expert != -1`. This is because `_find_nearest_expert` is called with `mapped_physical_experts` as `candidate_physical_expert_ids`, and it returns an element from that list (or `-1`). As a result, this `if` block is effectively dead code, and the nearest expert is never prioritized.

This defeats the purpose of finding the nearest expert for locality optimization.

To correctly prioritize the nearest expert, it should be moved to the front of the `mapped_physical_experts` list. A swap with the first element is a safe way to do this while preserving the list length and avoiding duplicates.

Suggested change
mapped_physical_experts = logical_to_all_physical_map[layer_id][
logical_expert_id
]
if (
nearest_expert != -1
and nearest_expert not in mapped_physical_experts
):
mapped_physical_experts[0] = nearest_expert
mapped_physical_experts = logical_to_all_physical_map[layer_id][
logical_expert_id
]
if nearest_expert != -1:
# The `_find_nearest_expert` function returns an expert from `mapped_physical_experts`.
# We should move it to the front to prioritize it for dispatch.
try:
current_idx = mapped_physical_experts.index(nearest_expert)
if current_idx > 0:
# Swap with the first element to preserve list length and avoid duplicates.
mapped_physical_experts[0], mapped_physical_experts[current_idx] = (
mapped_physical_experts[current_idx],
mapped_physical_experts[0],
)
except ValueError:
# This path should ideally not be taken.
# It implies `_find_nearest_expert` returned an expert not in the candidate list.
# As a fallback, we replace the first expert.
mapped_physical_experts[0] = nearest_expert


logical_to_all_physical_map = _pad_nested_array(
logical_to_all_physical_map, pad_value=-1
Expand Down
6 changes: 2 additions & 4 deletions python/sglang/srt/layers/moe/fused_moe_triton/layer.py
Original file line number Diff line number Diff line change
Expand Up @@ -539,12 +539,9 @@ def weight_loader(
# This is a shared expert.
physical_expert_ids = [expert_id]
else:
require_global_experts = getattr(
param, "_sglang_require_global_experts", False
)
physical_expert_ids = (
global_expert_location_metadata.logical_to_all_physical(
self.layer_id, expert_id, require_global_experts
self.layer_id, expert_id
)
)

Expand Down Expand Up @@ -1129,6 +1126,7 @@ def forward(self, hidden_states: torch.Tensor, topk_output: TopKOutput):
local_expert_offset=self.moe_ep_rank * self.num_local_experts,
local_num_experts=self.num_local_experts,
routed_scaling_factor=self.moe_runner_config.routed_scaling_factor,
tile_tokens_dim=None,
routing_method_type=RoutingMethodType.DeepSeekV3,
do_finalize=True,
output=symm_output,
Expand Down
1 change: 1 addition & 0 deletions python/sglang/srt/layers/quantization/fp8.py
Original file line number Diff line number Diff line change
Expand Up @@ -1245,6 +1245,7 @@ def apply_with_router_logits(
routed_scaling_factor=(
routed_scaling_factor if routed_scaling_factor is not None else 1.0
),
tile_tokens_dim=None,
routing_method_type=routing_method_type,
use_shuffled_weight=False,
)
Expand Down
1 change: 1 addition & 0 deletions python/sglang/srt/layers/quantization/modelopt_quant.py
Original file line number Diff line number Diff line change
Expand Up @@ -695,6 +695,7 @@ def apply(
else 1.0
),
use_routing_scales_on_input=use_routing_scales_on_input,
tile_tokens_dim=None,
routing_method_type=routing_method_type,
)

Expand Down
1 change: 1 addition & 0 deletions python/sglang/srt/layers/quantization/mxfp4.py
Original file line number Diff line number Diff line change
Expand Up @@ -681,6 +681,7 @@ def apply(
layer.moe_ep_rank * layer.num_local_experts, # local_expert_offset
layer.num_local_experts, # local num experts
None,
None, # tile_tokens_dim
1, # routing_method_type, renormalize
True, # do finalize
output=symm_output,
Expand Down
Loading