EPLB #5295 (Closed)

Changes from all commits (1118 commits, authored by fzyzcjy, Apr 17-18, 2025)
14 changes: 1 addition & 13 deletions docs/backend/native_api.ipynb
@@ -408,19 +408,7 @@
 "print_highlight(response)\n",
 "\n",
 "response = requests.post(f\"http://localhost:{port}/dump_expert_distribution_record\")\n",
-"print_highlight(response)\n",
-"\n",
-"import glob\n",
-"\n",
-"output_file = glob.glob(\"expert_distribution_*.csv\")[0]\n",
-"with open(output_file, \"r\") as f:\n",
-"    print_highlight(\"\\n| Layer ID | Expert ID | Count |\")\n",
-"    print_highlight(\"|----------|-----------|--------|\")\n",
-"    next(f)\n",
-"    for i, line in enumerate(f):\n",
-"        if i < 9:\n",
-"            layer_id, expert_id, count = line.strip().split(\",\")\n",
-"            print_highlight(f\"| {layer_id:8} | {expert_id:9} | {count:6} |\")"
+"print_highlight(response)"
 ]
 },
 {
1 change: 1 addition & 0 deletions python/sglang/bench_one_batch.py
@@ -129,6 +129,7 @@ def load_model(server_args, port_args, tp_rank):
     suppress_other_loggers()
     rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None

+    # TODO re-apply from_server_args PR
     model_config = ModelConfig(
         server_args.model_path,
         trust_remote_code=server_args.trust_remote_code,
40 changes: 35 additions & 5 deletions python/sglang/bench_serving.py
@@ -480,10 +480,11 @@ def get_tokenizer(


 def get_dataset(args, tokenizer):
+    num_prompts = args.num_prompts + args.skip_num_prompts
     if args.dataset_name == "sharegpt":
         input_requests = sample_sharegpt_requests(
             dataset_path=args.dataset_path,
-            num_requests=args.num_prompts,
+            num_requests=num_prompts,
             tokenizer=tokenizer,
             fixed_output_len=args.sharegpt_output_len,
             context_len=args.sharegpt_context_len,
@@ -494,7 +495,7 @@ def get_dataset(args, tokenizer):
         input_requests = sample_random_requests(
             input_len=args.random_input_len,
             output_len=args.random_output_len,
-            num_prompts=args.num_prompts,
+            num_prompts=num_prompts,
             range_ratio=args.random_range_ratio,
             tokenizer=tokenizer,
             dataset_path=args.dataset_path,
@@ -512,6 +513,7 @@ def get_dataset(args, tokenizer):
         )
     else:
         raise ValueError(f"Unknown dataset: {args.dataset_name}")
+    input_requests = input_requests[args.skip_num_prompts :]
     return input_requests
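get_dataset now oversamples by the skip count and slices the prefix off afterwards, so a run can skip prompts already consumed (e.g., by a warm-up run) while still drawing from the same deterministic sample sequence. A minimal sketch of the pattern, with a stand-in sample_requests for the dataset-specific samplers above:

    # Sketch of the oversample-then-skip pattern; `sample_requests` stands in
    # for sample_sharegpt_requests / sample_random_requests.
    def get_requests(num_prompts, skip_num_prompts, sample_requests):
        # Sample enough for the skipped prefix plus the benchmarked tail.
        requests = sample_requests(num_requests=num_prompts + skip_num_prompts)
        # Drop the prefix; only the tail is returned for benchmarking.
        return requests[skip_num_prompts:]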


@@ -607,7 +609,7 @@ def sample_sharegpt_requests(
     apply_chat_template=False,
 ) -> List[Tuple[str, int, int]]:
     if fixed_output_len is not None and fixed_output_len < 4:
-        raise ValueError("output_len too small")
+        print("Warn: output_len too small")

     # Download sharegpt if necessary
     if not os.path.isfile(dataset_path) and dataset_path == "":
@@ -666,7 +668,7 @@ def sample_sharegpt_requests(
             len(completion_token_ids) if fixed_output_len is None else fixed_output_len
         )

-        if prompt_len < 2 or output_len < 2:
+        if prompt_len < 2 or ((fixed_output_len is None) and (output_len < 2)):
             # Prune too short sequences.
             continue

@@ -690,7 +692,6 @@ def sample_random_requests(
     dataset_path: str,
     random_sample: bool = True,
 ) -> List[Tuple[str, int, int]]:
-
     input_lens = np.random.randint(
         max(int(input_len * range_ratio), 1),
         input_len + 1,
@@ -976,6 +977,7 @@ async def benchmark(
     lora_names: List[str],
     extra_request_body: Dict[str, Any],
     profile: bool,
+    enable_expert_distribution_record: bool = False,
     pd_seperated: bool = False,
     flush_cache: bool = False,
 ):
@@ -1041,6 +1043,12 @@ async def limited_request_func(request_func_input, pbar):

     time.sleep(1.0)

+    if enable_expert_distribution_record:
+        print("Starting expert distribution record...")
+        output = await async_request_profile(
+            api_url=base_url + "/start_expert_distribution_record"
+        )
+        assert output.success
     # Start profiler
     if profile:
         print("Starting profiler...")
@@ -1085,6 +1093,16 @@ async def limited_request_func(request_func_input, pbar):
         profile_output = await async_request_profile(api_url=base_url + "/stop_profile")
         if profile_output.success:
             print("Profiler stopped")
+    if enable_expert_distribution_record:
+        print("Stopping expert distribution record...")
+        output = await async_request_profile(
+            api_url=base_url + "/dump_expert_distribution_record"
+        )
+        assert output.success
+        output = await async_request_profile(
+            api_url=base_url + "/stop_expert_distribution_record"
+        )
+        assert output.success

     if pbar is not None:
         pbar.close()
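The same record can also be driven by hand against a running server; a small sketch using the three endpoints referenced above (the server address is an assumption):

    import requests

    base_url = "http://localhost:30000"  # assumed server address

    # Begin recording expert routing statistics.
    requests.post(f"{base_url}/start_expert_distribution_record")
    # ... send generation traffic here ...
    # Persist the accumulated record, then stop recording.
    requests.post(f"{base_url}/dump_expert_distribution_record")
    requests.post(f"{base_url}/stop_expert_distribution_record")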
@@ -1393,6 +1411,7 @@ def run_benchmark(args_: argparse.Namespace):
            lora_names=args.lora_name,
            extra_request_body=extra_request_body,
            profile=args.profile,
+           enable_expert_distribution_record=args.enable_expert_distribution_record,
            pd_seperated=args.pd_seperated,
            flush_cache=args.flush_cache,
        )
@@ -1466,6 +1485,12 @@ def __call__(self, parser, namespace, values, option_string=None):
         default=1000,
         help="Number of prompts to process. Default is 1000.",
     )
+    parser.add_argument(
+        "--skip-num-prompts",
+        type=int,
+        default=0,
+        help="Number of prompts to skip. Default is 0.",
+    )
     parser.add_argument(
         "--sharegpt-output-len",
         type=int,
@@ -1557,6 +1582,11 @@ def __call__(self, parser, namespace, values, option_string=None):
         help="Use Torch Profiler. The endpoint must be launched with "
         "SGLANG_TORCH_PROFILER_DIR to enable profiler.",
     )
+    parser.add_argument(
+        "--enable-expert-distribution-record",
+        action="store_true",
+        help="Enable expert distribution record",
+    )
     parser.add_argument(
         "--lora-name",
         type=str,
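Taken together, a run exercising both new flags might look like this (model, port, and prompt counts are illustrative):

    python3 -m sglang.bench_serving --backend sglang --port 30000 \
        --num-prompts 1000 --skip-num-prompts 200 \
        --enable-expert-distribution-record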
14 changes: 14 additions & 0 deletions python/sglang/srt/configs/model_config.py
@@ -24,6 +24,7 @@

 from sglang.srt.hf_transformers_utils import get_config, get_context_length
 from sglang.srt.layers.quantization import QUANTIZATION_METHODS
+from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import get_bool_env_var, is_hip

 logger = logging.getLogger(__name__)
@@ -187,6 +188,19 @@ def __init__(
         self.hf_eos_token_id = self.get_hf_eos_token_id()
         self.image_token_id = getattr(self.hf_config, "image_token_id", None)

+    @staticmethod
+    def from_server_args(server_args: ServerArgs, model_path: str = None):
+        return ModelConfig(
+            model_path=model_path or server_args.model_path,
+            trust_remote_code=server_args.trust_remote_code,
+            revision=server_args.revision,
+            context_length=server_args.context_length,
+            model_override_args=server_args.json_model_override_args,
+            is_embedding=server_args.is_embedding,
+            dtype=server_args.dtype,
+            quantization=server_args.quantization,
+        )
+
     # adapted from https://github.com/vllm-project/vllm/blob/main/vllm/config.py#L289
     def get_total_num_kv_heads(self) -> int:
         """Returns the total number of KV heads."""
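The new factory centralizes the field-by-field construction that call sites such as bench_one_batch.py currently repeat (the TODO above appears to refer to this helper). A hedged usage sketch, with an illustrative model path:

    from sglang.srt.configs.model_config import ModelConfig
    from sglang.srt.server_args import ServerArgs

    server_args = ServerArgs(model_path="meta-llama/Llama-3.1-8B-Instruct")  # illustrative
    # model_path can be overridden, e.g. to build a config for a draft model.
    model_config = ModelConfig.from_server_args(server_args)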
1 change: 1 addition & 0 deletions python/sglang/srt/disaggregation/decode.py
@@ -426,6 +426,7 @@ def event_loop_normal_disagg_decode(self):
         while True:
             recv_reqs = self.recv_requests()
             self.process_input_requests(recv_reqs)
+            self.model_runner_event_loop_step()
             # polling and allocating kv cache
             self.process_decode_queue()
             batch = self.get_next_disagg_decode_batch_to_run()
1 change: 1 addition & 0 deletions python/sglang/srt/disaggregation/prefill.py
@@ -178,6 +178,7 @@ def event_loop_normal_disagg_prefill(self):
         while True:
             recv_reqs = self.recv_requests()
             self.process_input_requests(recv_reqs)
+            self.model_runner_event_loop_step()
             self.waiting_queue.extend(
                 self.disagg_prefill_pending_queue.pop_bootstrapped()
             )
90 changes: 87 additions & 3 deletions python/sglang/srt/entrypoints/engine.py
@@ -20,17 +20,24 @@
 import asyncio
 import atexit
 import dataclasses
+import json
 import logging
 import multiprocessing as mp
 import os
 import signal
 import threading
+from json import JSONDecodeError
+from pathlib import Path
 from typing import AsyncIterator, Dict, Iterator, List, Optional, Tuple, Union

 import zmq
 import zmq.asyncio
 from PIL.Image import Image

+from sglang.srt.configs.model_config import ModelConfig
+from sglang.srt.managers.eplb_manager import EPLBManager
+from sglang.srt.managers.expert_location import ExpertLocationMetadata
+
 # Fix a bug of Python threading
 setattr(threading, "_register_atexit", lambda *args, **kwargs: None)

@@ -45,13 +52,15 @@
 from sglang.srt.managers.detokenizer_manager import run_detokenizer_process
 from sglang.srt.managers.io_struct import (
     EmbeddingReqInput,
+    EplbRebalanceReqInput,
     GenerateReqInput,
     GetWeightsByNameReqInput,
     InitWeightsUpdateGroupReqInput,
     ReleaseMemoryOccupationReqInput,
     ResumeMemoryOccupationReqInput,
     RpcReqInput,
     RpcReqOutput,
+    UpdateExpertLocationReqInput,
     UpdateWeightFromDiskReqInput,
     UpdateWeightsFromDistributedReqInput,
     UpdateWeightsFromTensorReqInput,
@@ -279,6 +288,10 @@ def __exit__(self, exc_type, exc_value, traceback):
         self.shutdown()
         return False

+    def flush_cache(self):
+        loop = asyncio.get_event_loop()
+        return loop.run_until_complete(self.tokenizer_manager.flush_cache())
+
     def start_profile(self):
         loop = asyncio.get_event_loop()
         loop.run_until_complete(self.tokenizer_manager.start_profile())
@@ -355,10 +368,32 @@ def update_weights_from_tensor(
             self.tokenizer_manager.update_weights_from_tensor(obj, None)
         )

+    def eplb_rebalance(self):
+        loop = asyncio.get_event_loop()
+        return loop.run_until_complete(
+            self.tokenizer_manager.eplb_rebalance(EplbRebalanceReqInput())
+        )
+
+    def eplb_save_expert_distribution(self):
+        loop = asyncio.get_event_loop()
+        return loop.run_until_complete(
+            self.tokenizer_manager.eplb_save_expert_distribution()
+        )
+
+    def update_expert_location(self, expert_location_metadata: ExpertLocationMetadata):
+        obj = UpdateExpertLocationReqInput(
+            expert_location_metadata=expert_location_metadata,
+        )
+        loop = asyncio.get_event_loop()
+        return loop.run_until_complete(
+            self.tokenizer_manager.update_expert_location(obj)
+        )
+
     def update_weights_from_disk(
         self,
         model_path: str,
         load_format: Optional[str] = None,
+        param_categories: Optional[List[str]] = None,
     ):
         """Update the weights from disk inplace without re-launching the engine.

@@ -369,6 +404,7 @@ def update_weights_from_disk(
         obj = UpdateWeightFromDiskReqInput(
             model_path=model_path,
             load_format=load_format,
+            param_categories=param_categories,
         )

         loop = asyncio.get_event_loop()
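These methods make the EPLB flow scriptable from the offline Engine API; a hedged sketch (the model path is illustrative, and enable_eplb is the new server flag used elsewhere in this diff):

    import sglang as sgl

    # Illustrative MoE model; enable_eplb is forwarded to the server args.
    engine = sgl.Engine(model_path="deepseek-ai/DeepSeek-V2-Lite", enable_eplb=True)
    engine.generate("warm the router with some traffic")

    # Persist the observed expert load, then rebalance expert placement.
    engine.eplb_save_expert_distribution()
    engine.eplb_rebalance()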
@@ -496,6 +532,14 @@ def _launch_subprocesses(
         server_args.model_path, server_args.tokenizer_path
     )

+    if server_args.node_rank == 0:
+        eplb_manager = EPLBManager(server_args) if server_args.enable_eplb else None
+        expert_location_metadata = _compute_initial_expert_location_metadata(
+            server_args, eplb_manager
+        )
+    else:
+        eplb_manager = expert_location_metadata = None
+
     scheduler_procs = []
     if server_args.dp_size == 1:
         # Launch tensor parallel scheduler processes
@@ -517,7 +561,15 @@ def _launch_subprocesses(
             )
             proc = mp.Process(
                 target=run_scheduler_process,
-                args=(server_args, port_args, gpu_id, tp_rank, None, writer),
+                args=(
+                    server_args,
+                    port_args,
+                    expert_location_metadata,
+                    gpu_id,
+                    tp_rank,
+                    None,
+                    writer,
+                ),
             )
             with memory_saver_adapter.configure_subprocess():
                 proc.start()
@@ -529,7 +581,7 @@ def _launch_subprocesses(
         scheduler_pipe_readers = [reader]
         proc = mp.Process(
             target=run_data_parallel_controller_process,
-            args=(server_args, port_args, writer),
+            args=(server_args, port_args, expert_location_metadata, writer),
         )
         proc.start()
         scheduler_procs.append(proc)
@@ -566,7 +618,9 @@ def _launch_subprocesses(
     detoken_proc.start()

     # Launch tokenizer process
-    tokenizer_manager = TokenizerManager(server_args, port_args)
+    tokenizer_manager = TokenizerManager(
+        server_args, port_args, expert_location_metadata, eplb_manager
+    )
     if server_args.chat_template:
         load_chat_template_for_openai_api(
             tokenizer_manager, server_args.chat_template, server_args.model_path
@@ -598,3 +652,33 @@ def _launch_subprocesses(
     scheduler_info = scheduler_infos[0]
     tokenizer_manager.max_req_input_len = scheduler_info["max_req_input_len"]
     return tokenizer_manager, scheduler_info
+
+
+def _compute_initial_expert_location_metadata(
+    server_args: ServerArgs, eplb_manager: EPLBManager
+) -> ExpertLocationMetadata:
+    if (data := server_args.init_expert_location) is not None:
+        try:
+            data_dict = json.loads(data)
+        except JSONDecodeError:
+            data_dict = json.loads(Path(data).read_text())
+
+        if "physical_to_logical_map" in data_dict:
+            logger.info(
+                "init_expert_location from init_by_mapping using ServerArgs.init_expert_location"
+            )
+            return ExpertLocationMetadata.init_by_mapping(server_args, **data_dict)
+        elif "logical_count" in data_dict:
+            logger.info(
+                "init_expert_location from init_by_eplb using ServerArgs.init_expert_location"
+            )
+            return ExpertLocationMetadata.init_by_eplb(server_args, **data_dict)
+        else:
+            raise NotImplementedError(
+                f"Unknown init_expert_location format ({list(data_dict.keys())=})"
+            )
+    if server_args.enable_eplb:
+        logger.info("init_expert_location from EPLBManager")
+        return eplb_manager.compute_expert_location_metadata()
+
+    return ExpertLocationMetadata.init_trivial(server_args)
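--init-expert-location accepts either inline JSON or a path to a JSON file, in one of the two shapes keyed on above (array contents are illustrative; rows correspond to MoE layers):

    {"physical_to_logical_map": [[0, 1, 2, 3], [2, 3, 0, 1]]}

selects init_by_mapping (each physical expert slot names the logical expert it hosts), while

    {"logical_count": [[10, 3, 7, 1], [2, 9, 4, 6]]}

selects init_by_eplb (per-layer logical expert load for the balancer to place).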