sgl-project · fzyzcjy · Apr 1, 2025 · Apr 1, 2025 · Apr 1, 2025 · Apr 1, 2025
diff --git a/docs/backend/native_api.ipynb b/docs/backend/native_api.ipynb
@@ -408,19 +408,7 @@
     "print_highlight(response)\n",
     "\n",
     "response = requests.post(f\"http://localhost:{port}/dump_expert_distribution_record\")\n",
-    "print_highlight(response)\n",
-    "\n",
-    "import glob\n",
-    "\n",
-    "output_file = glob.glob(\"expert_distribution_*.csv\")[0]\n",
-    "with open(output_file, \"r\") as f:\n",
-    "    print_highlight(\"\\n| Layer ID | Expert ID | Count |\")\n",
-    "    print_highlight(\"|----------|-----------|--------|\")\n",
-    "    next(f)\n",
-    "    for i, line in enumerate(f):\n",
-    "        if i < 9:\n",
-    "            layer_id, expert_id, count = line.strip().split(\",\")\n",
-    "            print_highlight(f\"| {layer_id:8} | {expert_id:9} | {count:6} |\")"
+    "print_highlight(response)"
    ]
   },
   {

@@ -129,16 +129,7 @@ def load_model(server_args, port_args, tp_rank):
     suppress_other_loggers()
     rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
 
-    model_config = ModelConfig(
-        server_args.model_path,
-        trust_remote_code=server_args.trust_remote_code,
-        revision=server_args.revision,
-        context_length=server_args.context_length,
-        model_override_args=server_args.json_model_override_args,
-        is_embedding=server_args.is_embedding,
-        dtype=server_args.dtype,
-        quantization=server_args.quantization,
-    )
+    model_config = ModelConfig.from_server_args(server_args)
     model_runner = ModelRunner(
         model_config=model_config,
         mem_fraction_static=server_args.mem_fraction_static,

diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py
@@ -24,6 +24,7 @@
 
 from sglang.srt.hf_transformers_utils import get_config, get_context_length
 from sglang.srt.layers.quantization import QUANTIZATION_METHODS
+from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import get_bool_env_var, is_hip
 
 logger = logging.getLogger(__name__)
@@ -171,6 +172,35 @@ def __init__(
         self.hf_eos_token_id = self.get_hf_eos_token_id()
         self.image_token_id = getattr(self.hf_config, "image_token_id", None)
 
+    @classmethod
+    def from_server_args(
+        cls, server_args: ServerArgs, model_path: str = None
+    ) -> "ModelConfig":
+        """Build a model config from the fields of ``server_args``.
+
+        Args:
+            server_args: Source of the trust_remote_code, revision,
+                context_length, json_model_override_args, is_embedding,
+                dtype and quantization settings.
+            model_path: Optional override; when falsy (``None`` or ``""``),
+                ``server_args.model_path`` is used instead.
+
+        Returns:
+            A new instance of the class the method is called on.
+        """
+        # A classmethod (rather than a staticmethod hard-wired to ModelConfig)
+        # lets subclasses reuse this alternate constructor.
+        return cls(
+            model_path=model_path or server_args.model_path,
+            trust_remote_code=server_args.trust_remote_code,
+            revision=server_args.revision,
+            context_length=server_args.context_length,
+            model_override_args=server_args.json_model_override_args,
+            is_embedding=server_args.is_embedding,
+            dtype=server_args.dtype,
+            quantization=server_args.quantization,
+        )
+
     # adapted from https://github.com/vllm-project/vllm/blob/main/vllm/config.py#L289
     def get_total_num_kv_heads(self) -> int:
         """Returns the total number of KV heads."""

@@ -31,6 +31,9 @@
 import zmq.asyncio
 from PIL.Image import Image
 
+from sglang.srt.configs.model_config import ModelConfig
+from sglang.srt.managers.expert_location import ExpertLocationMetadata
+
 # Fix a bug of Python threading
 setattr(threading, "_register_atexit", lambda *args, **kwargs: None)
 
@@ -495,6 +498,9 @@ def _launch_subprocesses(
         server_args.model_path, server_args.tokenizer_path
     )
 
+    model_config = ModelConfig.from_server_args(server_args)
+    expert_location_metadata = ExpertLocationMetadata.from_model_config(model_config)
+
     scheduler_procs = []
     if server_args.dp_size == 1:
         # Launch tensor parallel scheduler processes
@@ -516,7 +522,15 @@ def _launch_subprocesses(
             )
             proc = mp.Process(
                 target=run_scheduler_process,
-                args=(server_args, port_args, gpu_id, tp_rank, None, writer),
+                args=(
+                    server_args,
+                    port_args,
+                    expert_location_metadata,
+                    gpu_id,
+                    tp_rank,
+                    None,
+                    writer,
+                ),
             )
             with memory_saver_adapter.configure_subprocess():
                 proc.start()
@@ -528,7 +542,7 @@ def _launch_subprocesses(
         scheduler_pipe_readers = [reader]
         proc = mp.Process(
             target=run_data_parallel_controller_process,
-            args=(server_args, port_args, writer),
+            args=(server_args, port_args, expert_location_metadata, writer),
         )
         proc.start()
         scheduler_procs.append(proc)
@@ -565,7 +579,9 @@ def _launch_subprocesses(
     detoken_proc.start()
 
     # Launch tokenizer process
-    tokenizer_manager = TokenizerManager(server_args, port_args)
+    tokenizer_manager = TokenizerManager(
+        server_args, port_args, expert_location_metadata
+    )
     if server_args.chat_template:
         load_chat_template_for_openai_api(
             tokenizer_manager, server_args.chat_template, server_args.model_path

@@ -366,11 +366,8 @@ async def stop_expert_distribution_record_async():
 @app.api_route("/dump_expert_distribution_record", methods=["GET", "POST"])
 async def dump_expert_distribution_record_async():
     """Dump expert distribution record."""
-    await _global_state.tokenizer_manager.dump_expert_distribution_record()
-    return Response(
-        content="Dump expert distribution record.\n",
-        status_code=200,
-    )
+    content = await _global_state.tokenizer_manager.dump_expert_distribution_record()
+    return ORJSONResponse(content, status_code=200)
 
 
 @app.post("/update_weights_from_disk")

@@ -3,6 +3,8 @@
 
 import torch
 
+from sglang.srt.managers.schedule_batch import get_global_expert_location_metadata
+
 try:
     from deep_gemm import (
         get_col_major_tma_aligned_tensor,
@@ -131,6 +133,7 @@ def __init__(
         top_k: int,
         hidden_size: int,
         intermediate_size: int,
+        layer_id: int,
         params_dtype: Optional[torch.dtype] = None,
         renormalize: bool = True,
         use_grouped_topk: bool = False,
@@ -153,6 +156,7 @@ def __init__(
         )
         self.tp_rank = get_tensor_model_parallel_rank()
 
+        self.layer_id = layer_id
         self.num_experts = num_experts
         assert self.num_experts % self.tp_size == 0
         self.num_experts_per_partition = self.num_experts // self.tp_size
@@ -221,6 +225,9 @@ def forward(self, hidden_states: torch.Tensor, router_logits: torch.Tensor):
             num_expert_group=self.num_expert_group,
             correction_bias=self.correction_bias,
             custom_routing_function=self.custom_routing_function,
+            expert_logical_to_rank_dispatch_physical_map=get_global_expert_location_metadata().logical_to_rank_dispatch_physical_map[
+                self.tp_rank, self.layer_id, :
+            ],
         )
 
         reorder_topk_ids, src2dst, seg_indptr = run_moe_ep_preproess(
@@ -409,6 +416,28 @@ def weight_loader(
         weight_name: str,
         shard_id: str,
         expert_id: int,
+    ) -> None:
+        physical_expert_ids = (
+            get_global_expert_location_metadata().logical_to_all_physical(
+                self.layer_id, expert_id
+            )
+        )
+        for physical_expert_id in physical_expert_ids:
+            self._weight_loader_physical(
+                param=param,
+                loaded_weight=loaded_weight,
+                weight_name=weight_name,
+                shard_id=shard_id,
+                expert_id=physical_expert_id,
+            )
+
+    def _weight_loader_physical(
+        self,
+        param: torch.nn.Parameter,
+        loaded_weight: torch.Tensor,
+        weight_name: str,
+        shard_id: str,
+        expert_id: int,
     ) -> None:
         if expert_id < self.start_expert_id or expert_id > self.end_expert_id:
             return
@@ -802,6 +831,7 @@ def __init__(
         top_k: int,
         hidden_size: int,
         intermediate_size: int,
+        layer_id: int,
         params_dtype: Optional[torch.dtype] = None,
         renormalize: bool = True,
         use_grouped_topk: bool = False,
@@ -820,6 +850,7 @@ def __init__(
             top_k,
             hidden_size,
             intermediate_size,
+            layer_id,
             params_dtype,
             renormalize,
             use_grouped_topk,

@@ -1,3 +1,4 @@
+from sglang.srt.managers.expert_distribution import expert_distribution_recorder
 from sglang.srt.utils import DeepEPMode
 
 try:
@@ -248,7 +249,7 @@ def _dispatch_core(
             recv_x,
             recv_topk_idx,
             recv_topk_weights,
-            _,  # num_recv_tokens_per_expert_list
+            num_recv_tokens_per_expert_list,
             self.handle,
             event,
         ) = buffer.dispatch(
@@ -264,6 +265,10 @@ def _dispatch_core(
             allocate_on_comm_stream=(previous_event is not None) and self.async_finish,
         )
 
+        expert_distribution_recorder.on_deepep_dispatch_normal(
+            num_recv_tokens_per_expert_list
+        )
+
         return (
             recv_x,
             recv_topk_idx,

@@ -268,6 +268,7 @@ def __init__(
         top_k: int,
         hidden_size: int,
         intermediate_size: int,
+        layer_id: Optional[int] = None,
         params_dtype: Optional[torch.dtype] = None,
         reduce_results: bool = False,
         renormalize: bool = True,

@@ -12,23 +12,23 @@
 # limitations under the License.
 # ==============================================================================
 
-import os
 from typing import Callable, Optional
 
 import torch
 import torch.nn.functional as F
 
-from sglang.srt.managers.expert_distribution import ExpertDistributionRecorder
-from sglang.srt.managers.schedule_batch import global_server_args_dict
+from sglang.srt.managers.expert_distribution import expert_distribution_recorder
+from sglang.srt.managers.schedule_batch import (
+    get_global_expert_location_metadata,
+    global_expert_location_metadata,
+    global_server_args_dict,
+)
 from sglang.srt.utils import get_compiler_backend, is_cuda, is_hip
 
 _is_cuda = is_cuda()
 _is_hip = is_hip()
 
 
-expert_distribution_recorder = ExpertDistributionRecorder()
-
-
 def fused_topk_native(
     hidden_states: torch.Tensor,
     gating_output: torch.Tensor,
@@ -250,6 +250,7 @@ def select_experts(
     custom_routing_function: Optional[Callable] = None,
     correction_bias: Optional[torch.Tensor] = None,
     torch_native: bool = False,
+    expert_logical_to_rank_dispatch_physical_map: Optional[torch.Tensor] = None,
 ):
     n_share_experts_fusion = 0
     if global_server_args_dict["n_share_experts_fusion"] is not None:
@@ -301,6 +302,10 @@ def select_experts(
             renormalize=renormalize,
         )
 
-    expert_distribution_recorder.record_new_token(topk_ids)
+    if expert_logical_to_rank_dispatch_physical_map is not None:
+        # TODO: this extra lookup is inefficient; fuse it into the existing kernels.
+        topk_ids = expert_logical_to_rank_dispatch_physical_map[topk_ids]
+
+    expert_distribution_recorder.on_select_experts(topk_ids=topk_ids)
 
     return topk_weights, topk_ids