Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
510 commits
Select commit Hold shift + click to select a range
3dfd359
more
fzyzcjy Apr 10, 2025
740b8e7
more
fzyzcjy Apr 10, 2025
5a6305e
more
fzyzcjy Apr 10, 2025
2b2da31
more
fzyzcjy Apr 10, 2025
1ea467b
more
fzyzcjy Apr 10, 2025
a90120a
more
fzyzcjy Apr 10, 2025
a4d47fa
more
fzyzcjy Apr 10, 2025
6948dab
fmt
fzyzcjy Apr 10, 2025
351bf69
more
fzyzcjy Apr 10, 2025
dfca693
more
fzyzcjy Apr 10, 2025
e5e955e
more
fzyzcjy Apr 10, 2025
cae8461
more
fzyzcjy Apr 10, 2025
d38fda2
more
fzyzcjy Apr 10, 2025
1f04f71
more
fzyzcjy Apr 10, 2025
ec13124
more
fzyzcjy Apr 10, 2025
a9cb3fa
more
fzyzcjy Apr 10, 2025
66a6702
more
fzyzcjy Apr 10, 2025
fa0ee76
rm
fzyzcjy Apr 10, 2025
52e820c
rm
fzyzcjy Apr 10, 2025
892630f
Merge branch 'feat/expert_distribution_recorder' into feat/phy_vs_log…
fzyzcjy Apr 10, 2025
5767e79
cherry pick
fzyzcjy Apr 10, 2025
f7be532
rm
fzyzcjy Apr 10, 2025
8767194
more
fzyzcjy Apr 10, 2025
afe433d
more
fzyzcjy Apr 10, 2025
f8b2c17
more
fzyzcjy Apr 10, 2025
2006ba4
more
fzyzcjy Apr 10, 2025
494172d
more
fzyzcjy Apr 10, 2025
cf76067
more
fzyzcjy Apr 10, 2025
ee32245
more
fzyzcjy Apr 10, 2025
7ad2a48
more
fzyzcjy Apr 10, 2025
d845b4e
more
fzyzcjy Apr 10, 2025
b7a52d5
more
fzyzcjy Apr 10, 2025
da995b1
more
fzyzcjy Apr 10, 2025
abbb2af
more
fzyzcjy Apr 10, 2025
eedfcb1
more
fzyzcjy Apr 10, 2025
3852d05
more
fzyzcjy Apr 10, 2025
cd4caee
more
fzyzcjy Apr 10, 2025
031ce90
more
fzyzcjy Apr 10, 2025
4623be4
more
fzyzcjy Apr 10, 2025
c19f4d8
fmt
fzyzcjy Apr 10, 2025
ecf08b8
Merge branch 'feat/phy_vs_logical' into feat/ep_num_redundant_experts
fzyzcjy Apr 10, 2025
b70afc1
fmt
fzyzcjy Apr 10, 2025
cc1cbd4
more
fzyzcjy Apr 10, 2025
64c3b19
more
fzyzcjy Apr 10, 2025
e09f1cf
fmt
fzyzcjy Apr 10, 2025
ba13a2d
more
fzyzcjy Apr 10, 2025
78ab92c
more
fzyzcjy Apr 10, 2025
fe10086
more
fzyzcjy Apr 10, 2025
5215648
more
fzyzcjy Apr 10, 2025
8e2c6ff
more
fzyzcjy Apr 10, 2025
adee880
more
fzyzcjy Apr 10, 2025
ff029cc
more
fzyzcjy Apr 10, 2025
44246e2
more
fzyzcjy Apr 10, 2025
63220f2
more
fzyzcjy Apr 10, 2025
a9aa042
more
fzyzcjy Apr 10, 2025
b437345
fmt
fzyzcjy Apr 10, 2025
9ea883b
mv
fzyzcjy Apr 10, 2025
4eb1936
more
fzyzcjy Apr 10, 2025
728bae5
more
fzyzcjy Apr 10, 2025
1b00700
more
fzyzcjy Apr 10, 2025
82d0d38
more
fzyzcjy Apr 10, 2025
c3479c0
more
fzyzcjy Apr 10, 2025
42116e1
more
fzyzcjy Apr 10, 2025
4d772bd
more
fzyzcjy Apr 10, 2025
11c08b0
more
fzyzcjy Apr 10, 2025
a997a46
more
fzyzcjy Apr 10, 2025
a3783f0
more
fzyzcjy Apr 10, 2025
7e78dfe
more
fzyzcjy Apr 10, 2025
78a2138
more
fzyzcjy Apr 10, 2025
fb5e00f
more
fzyzcjy Apr 10, 2025
2c4c689
more
fzyzcjy Apr 10, 2025
804d52b
more
fzyzcjy Apr 10, 2025
8d95825
more
fzyzcjy Apr 10, 2025
84dd409
more
fzyzcjy Apr 10, 2025
5f0e0bc
more
fzyzcjy Apr 10, 2025
db52e91
more
fzyzcjy Apr 10, 2025
f5ee24d
more
fzyzcjy Apr 10, 2025
af4e8f3
more
fzyzcjy Apr 10, 2025
022bbfe
more
fzyzcjy Apr 10, 2025
6276598
more
fzyzcjy Apr 10, 2025
9b1a6f2
more
fzyzcjy Apr 10, 2025
a80af1b
more
fzyzcjy Apr 10, 2025
9f0ba54
more
fzyzcjy Apr 10, 2025
f7ae980
more
fzyzcjy Apr 10, 2025
211fc81
more
fzyzcjy Apr 10, 2025
31692d7
more
fzyzcjy Apr 10, 2025
69c5777
more
fzyzcjy Apr 10, 2025
3af3fdc
more
fzyzcjy Apr 10, 2025
a46be2c
more
fzyzcjy Apr 10, 2025
0cbd4b4
more
fzyzcjy Apr 10, 2025
1ba8466
more
fzyzcjy Apr 10, 2025
e3186cf
more
fzyzcjy Apr 10, 2025
1582464
more
fzyzcjy Apr 10, 2025
c43c110
more
fzyzcjy Apr 10, 2025
981a6c4
more
fzyzcjy Apr 10, 2025
1baf771
more
fzyzcjy Apr 10, 2025
dc9dd3e
more
fzyzcjy Apr 10, 2025
9682c56
rm
fzyzcjy Apr 10, 2025
8979514
fmt
fzyzcjy Apr 10, 2025
8775c9e
more
fzyzcjy Apr 10, 2025
5a5fdb8
more
fzyzcjy Apr 10, 2025
0597c10
more
fzyzcjy Apr 10, 2025
7350e86
more
fzyzcjy Apr 10, 2025
bde8ed1
more
fzyzcjy Apr 10, 2025
bfbb6ab
more
fzyzcjy Apr 10, 2025
dc7c425
more
fzyzcjy Apr 10, 2025
661de4a
more
fzyzcjy Apr 10, 2025
621f102
fmt
fzyzcjy Apr 10, 2025
72fc8be
more
fzyzcjy Apr 10, 2025
c1f1516
more
fzyzcjy Apr 10, 2025
24ce30e
more
fzyzcjy Apr 10, 2025
68b8c2a
more
fzyzcjy Apr 10, 2025
8a66fe6
more
fzyzcjy Apr 10, 2025
676195b
more
fzyzcjy Apr 10, 2025
58cddd6
more
fzyzcjy Apr 10, 2025
ca82648
more
fzyzcjy Apr 10, 2025
57d512c
more
fzyzcjy Apr 10, 2025
4ae108c
more
fzyzcjy Apr 10, 2025
6b7b09c
more
fzyzcjy Apr 10, 2025
bf4d4b8
more
fzyzcjy Apr 10, 2025
00e2048
more
fzyzcjy Apr 10, 2025
8259636
more
fzyzcjy Apr 10, 2025
d930b74
more
fzyzcjy Apr 10, 2025
67bdc44
more
fzyzcjy Apr 10, 2025
aa415f4
more
fzyzcjy Apr 10, 2025
e4fbf09
more
fzyzcjy Apr 10, 2025
bbd416f
more
fzyzcjy Apr 10, 2025
a3eff17
more
fzyzcjy Apr 10, 2025
de47475
more
fzyzcjy Apr 10, 2025
61c8afe
more
fzyzcjy Apr 10, 2025
51f7ac7
more
fzyzcjy Apr 10, 2025
c1c5161
more
fzyzcjy Apr 10, 2025
d4531d1
more
fzyzcjy Apr 10, 2025
a85ca16
more
fzyzcjy Apr 10, 2025
e6009c6
more
fzyzcjy Apr 10, 2025
0643d12
more
fzyzcjy Apr 10, 2025
a29b0b4
more
fzyzcjy Apr 10, 2025
1114ab3
more
fzyzcjy Apr 10, 2025
8ccdb08
more
fzyzcjy Apr 10, 2025
4d21a24
more
fzyzcjy Apr 10, 2025
47ee3c9
more
fzyzcjy Apr 10, 2025
7059e3e
more
fzyzcjy Apr 10, 2025
be693e7
more
fzyzcjy Apr 10, 2025
ae9bac8
more
fzyzcjy Apr 10, 2025
dc1e050
more
fzyzcjy Apr 10, 2025
f178507
more
fzyzcjy Apr 10, 2025
4ebdffc
more
fzyzcjy Apr 10, 2025
a0eeeba
more
fzyzcjy Apr 10, 2025
44915c7
more
fzyzcjy Apr 10, 2025
aae2976
more
fzyzcjy Apr 10, 2025
686743a
more
fzyzcjy Apr 10, 2025
1cab4e4
more
fzyzcjy Apr 10, 2025
5ba2b94
more
fzyzcjy Apr 10, 2025
92e26a6
fmt
fzyzcjy Apr 10, 2025
92069e5
more
fzyzcjy Apr 10, 2025
50cdc3d
rm
fzyzcjy Apr 10, 2025
b9de9d1
Revert "rm"
fzyzcjy Apr 10, 2025
e910b3e
more
fzyzcjy Apr 10, 2025
a13ad3a
more
fzyzcjy Apr 10, 2025
3e151b9
more
fzyzcjy Apr 10, 2025
89a1a5e
more
fzyzcjy Apr 10, 2025
c351090
more
fzyzcjy Apr 10, 2025
8690ff2
more
fzyzcjy Apr 10, 2025
b57dee3
more
fzyzcjy Apr 10, 2025
d7e30c7
fmt
fzyzcjy Apr 10, 2025
e202174
Merge branch 'feat/eplb_algo' into feat/hot_update_expert_location
fzyzcjy Apr 10, 2025
1b9d164
more
fzyzcjy Apr 10, 2025
c5c2790
more
fzyzcjy Apr 10, 2025
e441686
more
fzyzcjy Apr 10, 2025
a938258
more
fzyzcjy Apr 10, 2025
792da16
more
fzyzcjy Apr 10, 2025
ad69c3b
more
fzyzcjy Apr 10, 2025
f66c045
Merge branch 'main' into feat/colocate_batch_gen
fzyzcjy Apr 10, 2025
0ac176b
fmt
fzyzcjy Apr 10, 2025
6f3fd42
more
fzyzcjy Apr 10, 2025
a4412b6
Merge branch 'feat/colocate_batch_gen' into feat/hot_update_expert_lo…
fzyzcjy Apr 10, 2025
c8261e8
more
fzyzcjy Apr 10, 2025
8218a45
more
fzyzcjy Apr 10, 2025
3a5fe8e
more
fzyzcjy Apr 10, 2025
a059d25
more
fzyzcjy Apr 10, 2025
da7c69d
more
fzyzcjy Apr 10, 2025
a509f31
more
fzyzcjy Apr 10, 2025
23d1207
more
fzyzcjy Apr 10, 2025
5e56309
more
fzyzcjy Apr 10, 2025
7a4517b
more
fzyzcjy Apr 10, 2025
e6925de
more
fzyzcjy Apr 10, 2025
d87235f
more
fzyzcjy Apr 10, 2025
de7dbd2
more
fzyzcjy Apr 10, 2025
71bee87
cherry pick
fzyzcjy Apr 10, 2025
15124de
more
fzyzcjy Apr 10, 2025
c362860
more
fzyzcjy Apr 10, 2025
d242b79
more
fzyzcjy Apr 10, 2025
6e5d979
more
fzyzcjy Apr 10, 2025
44f308d
more
fzyzcjy Apr 10, 2025
b98d8cc
more
fzyzcjy Apr 10, 2025
5f3d7ea
more
fzyzcjy Apr 10, 2025
b6ac080
more
fzyzcjy Apr 10, 2025
0a0e1d0
more
fzyzcjy Apr 10, 2025
2e3def9
more
fzyzcjy Apr 10, 2025
e72d4b1
more
fzyzcjy Apr 10, 2025
bd3cc0c
more
fzyzcjy Apr 10, 2025
9196eb5
more
fzyzcjy Apr 10, 2025
d1563bd
fmt
fzyzcjy Apr 10, 2025
4635582
more
fzyzcjy Apr 10, 2025
0e17b02
more
fzyzcjy Apr 10, 2025
c3efb6c
more
fzyzcjy Apr 10, 2025
f6c5c38
more
fzyzcjy Apr 10, 2025
adc3a03
more
fzyzcjy Apr 10, 2025
3af15fa
more
fzyzcjy Apr 10, 2025
72e7d44
more
fzyzcjy Apr 10, 2025
f3506bb
rename
fzyzcjy Apr 10, 2025
518b4fe
more
fzyzcjy Apr 10, 2025
3db8c61
more
fzyzcjy Apr 10, 2025
14b7d6e
more
fzyzcjy Apr 10, 2025
4ac95af
more
fzyzcjy Apr 10, 2025
5b9d05f
more
fzyzcjy Apr 10, 2025
46f079e
more
fzyzcjy Apr 10, 2025
c9a6c41
fmt
fzyzcjy Apr 10, 2025
8b41331
more
fzyzcjy Apr 10, 2025
f08973f
Merge branch 'feat/flush_cache_await' into feat/engine_flush_cache
fzyzcjy Apr 10, 2025
2d16f97
more
fzyzcjy Apr 10, 2025
f17187e
Merge branch 'feat/engine_flush_cache' into feat/eplb_e2e_test
fzyzcjy Apr 10, 2025
7488935
more
fzyzcjy Apr 10, 2025
275c0e2
more
fzyzcjy Apr 10, 2025
9c12636
more
fzyzcjy Apr 10, 2025
149781a
more
fzyzcjy Apr 10, 2025
507a44f
more
fzyzcjy Apr 10, 2025
e0bcfa3
more
fzyzcjy Apr 10, 2025
1da2438
more
fzyzcjy Apr 10, 2025
a8a8ffe
more
fzyzcjy Apr 10, 2025
6e3e673
more
fzyzcjy Apr 10, 2025
78b9e64
fmt
fzyzcjy Apr 10, 2025
472ebdb
Merge branch 'feat/period_rebalance' into feat/eplb_e2e_test
fzyzcjy Apr 10, 2025
fe202f2
more
fzyzcjy Apr 10, 2025
b080880
more
fzyzcjy Apr 10, 2025
07fcc4d
more
fzyzcjy Apr 10, 2025
cf6b689
more
fzyzcjy Apr 10, 2025
5ef94f2
more
fzyzcjy Apr 10, 2025
b74588d
more
fzyzcjy Apr 10, 2025
c73c927
more
fzyzcjy Apr 10, 2025
83950be
fmt
fzyzcjy Apr 10, 2025
452524a
more
fzyzcjy Apr 11, 2025
3bcf250
more
fzyzcjy Apr 11, 2025
de3ecd2
more
fzyzcjy Apr 11, 2025
3a61ef3
fmt
fzyzcjy Apr 11, 2025
b264b77
more
fzyzcjy Apr 11, 2025
3e8f979
more
fzyzcjy Apr 11, 2025
c38a21f
more
fzyzcjy Apr 11, 2025
247c675
more
fzyzcjy Apr 11, 2025
bba50db
fmt
fzyzcjy Apr 11, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 1 addition & 13 deletions docs/backend/native_api.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -408,19 +408,7 @@
"print_highlight(response)\n",
"\n",
"response = requests.post(f\"http://localhost:{port}/dump_expert_distribution_record\")\n",
"print_highlight(response)\n",
"\n",
"import glob\n",
"\n",
"output_file = glob.glob(\"expert_distribution_*.csv\")[0]\n",
"with open(output_file, \"r\") as f:\n",
" print_highlight(\"\\n| Layer ID | Expert ID | Count |\")\n",
" print_highlight(\"|----------|-----------|--------|\")\n",
" next(f)\n",
" for i, line in enumerate(f):\n",
" if i < 9:\n",
" layer_id, expert_id, count = line.strip().split(\",\")\n",
" print_highlight(f\"| {layer_id:8} | {expert_id:9} | {count:6} |\")"
"print_highlight(response)"
]
},
{
Expand Down
11 changes: 1 addition & 10 deletions python/sglang/bench_one_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,16 +129,7 @@ def load_model(server_args, port_args, tp_rank):
suppress_other_loggers()
rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None

model_config = ModelConfig(
server_args.model_path,
trust_remote_code=server_args.trust_remote_code,
revision=server_args.revision,
context_length=server_args.context_length,
model_override_args=server_args.json_model_override_args,
is_embedding=server_args.is_embedding,
dtype=server_args.dtype,
quantization=server_args.quantization,
)
model_config = ModelConfig.from_server_args(server_args)
model_runner = ModelRunner(
model_config=model_config,
mem_fraction_static=server_args.mem_fraction_static,
Expand Down
14 changes: 14 additions & 0 deletions python/sglang/srt/configs/model_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

from sglang.srt.hf_transformers_utils import get_config, get_context_length
from sglang.srt.layers.quantization import QUANTIZATION_METHODS
from sglang.srt.server_args import ServerArgs
from sglang.srt.utils import get_bool_env_var, is_hip

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -171,6 +172,19 @@ def __init__(
self.hf_eos_token_id = self.get_hf_eos_token_id()
self.image_token_id = getattr(self.hf_config, "image_token_id", None)

@staticmethod
def from_server_args(server_args: ServerArgs, model_path: str = None):
return ModelConfig(
model_path=model_path or server_args.model_path,
trust_remote_code=server_args.trust_remote_code,
revision=server_args.revision,
context_length=server_args.context_length,
model_override_args=server_args.json_model_override_args,
is_embedding=server_args.is_embedding,
dtype=server_args.dtype,
quantization=server_args.quantization,
)

# adapted from https://github.com/vllm-project/vllm/blob/main/vllm/config.py#L289
def get_total_num_kv_heads(self) -> int:
"""Returns the total number of KV heads."""
Expand Down
71 changes: 68 additions & 3 deletions python/sglang/srt/entrypoints/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import asyncio
import atexit
import dataclasses
import json
import logging
import multiprocessing as mp
import os
Expand All @@ -31,6 +32,10 @@
import zmq.asyncio
from PIL.Image import Image

from sglang.srt.configs.model_config import ModelConfig
from sglang.srt.managers.eplb_manager import EPLBManager
from sglang.srt.managers.expert_location import ExpertLocationMetadata

# Fix a bug of Python threading
setattr(threading, "_register_atexit", lambda *args, **kwargs: None)

Expand All @@ -51,6 +56,7 @@
ResumeMemoryOccupationReqInput,
RpcReqInput,
RpcReqOutput,
UpdateExpertLocationReqInput,
UpdateWeightFromDiskReqInput,
UpdateWeightsFromDistributedReqInput,
UpdateWeightsFromTensorReqInput,
Expand Down Expand Up @@ -278,6 +284,10 @@ def __exit__(self, exc_type, exc_value, traceback):
self.shutdown()
return False

def flush_cache(self):
loop = asyncio.get_event_loop()
return loop.run_until_complete(self.tokenizer_manager.flush_cache())

def start_profile(self):
loop = asyncio.get_event_loop()
loop.run_until_complete(self.tokenizer_manager.start_profile())
Expand Down Expand Up @@ -354,10 +364,30 @@ def update_weights_from_tensor(
self.tokenizer_manager.update_weights_from_tensor(obj, None)
)

def eplb_rebalance(self):
loop = asyncio.get_event_loop()
return loop.run_until_complete(self.tokenizer_manager.eplb_rebalance())

def eplb_save_expert_distribution(self):
loop = asyncio.get_event_loop()
return loop.run_until_complete(
self.tokenizer_manager.eplb_save_expert_distribution()
)

def update_expert_location(self, expert_location_metadata: ExpertLocationMetadata):
obj = UpdateExpertLocationReqInput(
expert_location_metadata=expert_location_metadata,
)
loop = asyncio.get_event_loop()
return loop.run_until_complete(
self.tokenizer_manager.update_expert_location(obj)
)

def update_weights_from_disk(
self,
model_path: str,
load_format: Optional[str] = None,
param_categories: Optional[List[str]] = None,
):
"""Update the weights from disk inplace without re-launching the engine.

Expand All @@ -368,6 +398,7 @@ def update_weights_from_disk(
obj = UpdateWeightFromDiskReqInput(
model_path=model_path,
load_format=load_format,
param_categories=param_categories,
)

loop = asyncio.get_event_loop()
Expand Down Expand Up @@ -495,6 +526,11 @@ def _launch_subprocesses(
server_args.model_path, server_args.tokenizer_path
)

eplb_manager = EPLBManager(server_args) if server_args.enable_eplb else None
expert_location_metadata = _compute_initial_expert_location_metadata(
server_args, eplb_manager
)

scheduler_procs = []
if server_args.dp_size == 1:
# Launch tensor parallel scheduler processes
Expand All @@ -516,7 +552,15 @@ def _launch_subprocesses(
)
proc = mp.Process(
target=run_scheduler_process,
args=(server_args, port_args, gpu_id, tp_rank, None, writer),
args=(
server_args,
port_args,
expert_location_metadata,
gpu_id,
tp_rank,
None,
writer,
),
)
with memory_saver_adapter.configure_subprocess():
proc.start()
Expand All @@ -528,7 +572,7 @@ def _launch_subprocesses(
scheduler_pipe_readers = [reader]
proc = mp.Process(
target=run_data_parallel_controller_process,
args=(server_args, port_args, writer),
args=(server_args, port_args, expert_location_metadata, writer),
)
proc.start()
scheduler_procs.append(proc)
Expand Down Expand Up @@ -565,7 +609,9 @@ def _launch_subprocesses(
detoken_proc.start()

# Launch tokenizer process
tokenizer_manager = TokenizerManager(server_args, port_args)
tokenizer_manager = TokenizerManager(
server_args, port_args, expert_location_metadata, eplb_manager
)
if server_args.chat_template:
load_chat_template_for_openai_api(
tokenizer_manager, server_args.chat_template, server_args.model_path
Expand Down Expand Up @@ -597,3 +643,22 @@ def _launch_subprocesses(
scheduler_info = scheduler_infos[0]
tokenizer_manager.max_req_input_len = scheduler_info["max_req_input_len"]
return tokenizer_manager, scheduler_info


def _compute_initial_expert_location_metadata(
server_args: ServerArgs, eplb_manager: EPLBManager
) -> ExpertLocationMetadata:
if (data := server_args.init_expert_location) is not None:
data_dict = json.loads(data)
if "physical_to_logical_map" in data_dict:
# TODO We may want to allow users to not provide `logical_to_all_physical_map` if this API is frequently used
return ExpertLocationMetadata.init_by_mapping(server_args, **data_dict)
elif "logical_count" in data_dict:
return ExpertLocationMetadata.init_by_eplb(server_args, **data_dict)
else:
raise NotImplementedError(
f"Unknown init_expert_location format ({list(data_dict.keys())=})"
)
if server_args.enable_eplb:
return eplb_manager.compute_expert_location_metadata()
return ExpertLocationMetadata.init_trivial(server_args)
23 changes: 16 additions & 7 deletions python/sglang/srt/entrypoints/http_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,11 +310,11 @@ async def classify_request(obj: EmbeddingReqInput, request: Request):
@app.api_route("/flush_cache", methods=["GET", "POST"])
async def flush_cache():
"""Flush the radix cache."""
_global_state.tokenizer_manager.flush_cache()
ret = await _global_state.tokenizer_manager.flush_cache()
return Response(
content="Cache flushed.\nPlease check backend logs for more details. "
"(When there are running or waiting requests, the operation will not be performed.)\n",
status_code=200,
status_code=200 if ret.success else HTTPStatus.BAD_REQUEST,
)


Expand Down Expand Up @@ -366,11 +366,20 @@ async def stop_expert_distribution_record_async():
@app.api_route("/dump_expert_distribution_record", methods=["GET", "POST"])
async def dump_expert_distribution_record_async():
"""Dump expert distribution record."""
await _global_state.tokenizer_manager.dump_expert_distribution_record()
return Response(
content="Dump expert distribution record.\n",
status_code=200,
)
content = await _global_state.tokenizer_manager.dump_expert_distribution_record()
return ORJSONResponse(content, status_code=200)


@app.post("/eplb_rebalance")
async def eplb_rebalance():
await _global_state.tokenizer_manager.eplb_rebalance()
return ORJSONResponse({}, status_code=200)


@app.post("/eplb_save_expert_distribution")
async def eplb_save_expert_distribution():
await _global_state.tokenizer_manager.eplb_save_expert_distribution()
return ORJSONResponse({}, status_code=200)


@app.post("/update_weights_from_disk")
Expand Down
31 changes: 31 additions & 0 deletions python/sglang/srt/layers/moe/ep_moe/layer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

import torch

from sglang.srt.managers.schedule_batch import get_global_expert_location_metadata

try:
from deep_gemm import (
get_col_major_tma_aligned_tensor,
Expand Down Expand Up @@ -131,6 +133,7 @@ def __init__(
top_k: int,
hidden_size: int,
intermediate_size: int,
layer_id: int,
params_dtype: Optional[torch.dtype] = None,
renormalize: bool = True,
use_grouped_topk: bool = False,
Expand All @@ -153,6 +156,7 @@ def __init__(
)
self.tp_rank = get_tensor_model_parallel_rank()

self.layer_id = layer_id
self.num_experts = num_experts
assert self.num_experts % self.tp_size == 0
self.num_experts_per_partition = self.num_experts // self.tp_size
Expand Down Expand Up @@ -221,6 +225,9 @@ def forward(self, hidden_states: torch.Tensor, router_logits: torch.Tensor):
num_expert_group=self.num_expert_group,
correction_bias=self.correction_bias,
custom_routing_function=self.custom_routing_function,
expert_logical_to_rank_dispatch_physical_map=get_global_expert_location_metadata().logical_to_rank_dispatch_physical_map[
self.tp_rank, self.layer_id, :
],
)

reorder_topk_ids, src2dst, seg_indptr = run_moe_ep_preproess(
Expand Down Expand Up @@ -409,6 +416,28 @@ def weight_loader(
weight_name: str,
shard_id: str,
expert_id: int,
) -> None:
physical_expert_ids = (
get_global_expert_location_metadata().logical_to_all_physical(
self.layer_id, expert_id
)
)
for physical_expert_id in physical_expert_ids:
self._weight_loader_physical(
param=param,
loaded_weight=loaded_weight,
weight_name=weight_name,
shard_id=shard_id,
expert_id=physical_expert_id,
)

def _weight_loader_physical(
self,
param: torch.nn.Parameter,
loaded_weight: torch.Tensor,
weight_name: str,
shard_id: str,
expert_id: int,
) -> None:
if expert_id < self.start_expert_id or expert_id > self.end_expert_id:
return
Expand Down Expand Up @@ -802,6 +831,7 @@ def __init__(
top_k: int,
hidden_size: int,
intermediate_size: int,
layer_id: int,
params_dtype: Optional[torch.dtype] = None,
renormalize: bool = True,
use_grouped_topk: bool = False,
Expand All @@ -820,6 +850,7 @@ def __init__(
top_k,
hidden_size,
intermediate_size,
layer_id,
params_dtype,
renormalize,
use_grouped_topk,
Expand Down
7 changes: 6 additions & 1 deletion python/sglang/srt/layers/moe/ep_moe/token_dispatcher.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from sglang.srt.managers.expert_distribution import expert_distribution_recorder
from sglang.srt.utils import DeepEPMode

try:
Expand Down Expand Up @@ -248,7 +249,7 @@ def _dispatch_core(
recv_x,
recv_topk_idx,
recv_topk_weights,
_, # num_recv_tokens_per_expert_list
num_recv_tokens_per_expert_list,
self.handle,
event,
) = buffer.dispatch(
Expand All @@ -264,6 +265,10 @@ def _dispatch_core(
allocate_on_comm_stream=(previous_event is not None) and self.async_finish,
)

expert_distribution_recorder.on_deepep_dispatch_normal(
num_recv_tokens_per_expert_list
)

return (
recv_x,
recv_topk_idx,
Expand Down
1 change: 1 addition & 0 deletions python/sglang/srt/layers/moe/fused_moe_triton/layer.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,7 @@ def __init__(
top_k: int,
hidden_size: int,
intermediate_size: int,
layer_id: Optional[int] = None,
params_dtype: Optional[torch.dtype] = None,
reduce_results: bool = False,
renormalize: bool = True,
Expand Down
Loading
Loading