EPLB #5295 (Closed)

Changes from all commits (1118 commits, authored by fzyzcjy, Apr 17-18, 2025)
14 changes: 1 addition & 13 deletions docs/backend/native_api.ipynb
@@ -408,19 +408,7 @@
 "print_highlight(response)\n",
 "\n",
 "response = requests.post(f\"http://localhost:{port}/dump_expert_distribution_record\")\n",
-"print_highlight(response)\n",
-"\n",
-"import glob\n",
-"\n",
-"output_file = glob.glob(\"expert_distribution_*.csv\")[0]\n",
-"with open(output_file, \"r\") as f:\n",
-"    print_highlight(\"\\n| Layer ID | Expert ID | Count |\")\n",
-"    print_highlight(\"|----------|-----------|--------|\")\n",
-"    next(f)\n",
-"    for i, line in enumerate(f):\n",
-"        if i < 9:\n",
-"            layer_id, expert_id, count = line.strip().split(\",\")\n",
-"            print_highlight(f\"| {layer_id:8} | {expert_id:9} | {count:6} |\")"
+"print_highlight(response)"
 ]
 },
 {
1 change: 1 addition & 0 deletions python/sglang/bench_one_batch.py
@@ -129,6 +129,7 @@ def load_model(server_args, port_args, tp_rank):
     suppress_other_loggers()
     rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None

+    # TODO re-apply from_server_args PR
     model_config = ModelConfig(
         server_args.model_path,
         trust_remote_code=server_args.trust_remote_code,
40 changes: 35 additions & 5 deletions python/sglang/bench_serving.py
@@ -480,10 +480,11 @@ def get_tokenizer(


 def get_dataset(args, tokenizer):
+    num_prompts = args.num_prompts + args.skip_num_prompts
     if args.dataset_name == "sharegpt":
         input_requests = sample_sharegpt_requests(
             dataset_path=args.dataset_path,
-            num_requests=args.num_prompts,
+            num_requests=num_prompts,
             tokenizer=tokenizer,
             fixed_output_len=args.sharegpt_output_len,
             context_len=args.sharegpt_context_len,
@@ -494,7 +495,7 @@ def get_dataset(args, tokenizer):
         input_requests = sample_random_requests(
             input_len=args.random_input_len,
             output_len=args.random_output_len,
-            num_prompts=args.num_prompts,
+            num_prompts=num_prompts,
             range_ratio=args.random_range_ratio,
             tokenizer=tokenizer,
             dataset_path=args.dataset_path,
@@ -512,6 +513,7 @@ def get_dataset(args, tokenizer):
         )
     else:
         raise ValueError(f"Unknown dataset: {args.dataset_name}")
+    input_requests = input_requests[args.skip_num_prompts :]
     return input_requests
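get_dataset now oversamples by the skip count and slices the prefix off afterwards, so a run can skip prompts already consumed (e.g., by a warm-up run) while still drawing from the same deterministic sample sequence. A minimal sketch of the pattern, with a stand-in sample_requests for the dataset-specific samplers above:

    # Sketch of the oversample-then-skip pattern; `sample_requests` stands in
    # for sample_sharegpt_requests / sample_random_requests.
    def get_requests(num_prompts, skip_num_prompts, sample_requests):
        # Sample enough for the skipped prefix plus the benchmarked tail.
        requests = sample_requests(num_requests=num_prompts + skip_num_prompts)
        # Drop the prefix; only the tail is returned for benchmarking.
        return requests[skip_num_prompts:]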


@@ -607,7 +609,7 @@ def sample_sharegpt_requests(
     apply_chat_template=False,
 ) -> List[Tuple[str, int, int]]:
     if fixed_output_len is not None and fixed_output_len < 4:
-        raise ValueError("output_len too small")
+        print("Warn: output_len too small")

     # Download sharegpt if necessary
     if not os.path.isfile(dataset_path) and dataset_path == "":
@@ -666,7 +668,7 @@ def sample_sharegpt_requests(
             len(completion_token_ids) if fixed_output_len is None else fixed_output_len
         )

-        if prompt_len < 2 or output_len < 2:
+        if prompt_len < 2 or ((fixed_output_len is None) and (output_len < 2)):
             # Prune too short sequences.
             continue

@@ -690,7 +692,6 @@ def sample_random_requests(
     dataset_path: str,
     random_sample: bool = True,
 ) -> List[Tuple[str, int, int]]:
-
     input_lens = np.random.randint(
         max(int(input_len * range_ratio), 1),
         input_len + 1,
@@ -976,6 +977,7 @@ async def benchmark(
     lora_names: List[str],
     extra_request_body: Dict[str, Any],
     profile: bool,
+    enable_expert_distribution_record: bool = False,
     pd_seperated: bool = False,
     flush_cache: bool = False,
 ):
@@ -1041,6 +1043,12 @@ async def limited_request_func(request_func_input, pbar):

     time.sleep(1.0)

+    if enable_expert_distribution_record:
+        print("Starting expert distribution record...")
+        output = await async_request_profile(
+            api_url=base_url + "/start_expert_distribution_record"
+        )
+        assert output.success
     # Start profiler
     if profile:
         print("Starting profiler...")
@@ -1085,6 +1093,16 @@ async def limited_request_func(request_func_input, pbar):
         profile_output = await async_request_profile(api_url=base_url + "/stop_profile")
         if profile_output.success:
             print("Profiler stopped")
+    if enable_expert_distribution_record:
+        print("Stopping expert distribution record...")
+        output = await async_request_profile(
+            api_url=base_url + "/dump_expert_distribution_record"
+        )
+        assert output.success
+        output = await async_request_profile(
+            api_url=base_url + "/stop_expert_distribution_record"
+        )
+        assert output.success

     if pbar is not None:
         pbar.close()
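The same record can also be driven by hand against a running server; a small sketch using the three endpoints referenced above (the server address is an assumption):

    import requests

    base_url = "http://localhost:30000"  # assumed server address

    # Begin recording expert routing statistics.
    requests.post(f"{base_url}/start_expert_distribution_record")
    # ... send generation traffic here ...
    # Persist the accumulated record, then stop recording.
    requests.post(f"{base_url}/dump_expert_distribution_record")
    requests.post(f"{base_url}/stop_expert_distribution_record")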
@@ -1393,6 +1411,7 @@ def run_benchmark(args_: argparse.Namespace):
            lora_names=args.lora_name,
            extra_request_body=extra_request_body,
            profile=args.profile,
+           enable_expert_distribution_record=args.enable_expert_distribution_record,
            pd_seperated=args.pd_seperated,
            flush_cache=args.flush_cache,
        )
@@ -1466,6 +1485,12 @@ def __call__(self, parser, namespace, values, option_string=None):
         default=1000,
         help="Number of prompts to process. Default is 1000.",
     )
+    parser.add_argument(
+        "--skip-num-prompts",
+        type=int,
+        default=0,
+        help="Number of prompts to skip. Default is 0.",
+    )
     parser.add_argument(
         "--sharegpt-output-len",
         type=int,
@@ -1557,6 +1582,11 @@ def __call__(self, parser, namespace, values, option_string=None):
         help="Use Torch Profiler. The endpoint must be launched with "
         "SGLANG_TORCH_PROFILER_DIR to enable profiler.",
     )
+    parser.add_argument(
+        "--enable-expert-distribution-record",
+        action="store_true",
+        help="Enable expert distribution record",
+    )
     parser.add_argument(
         "--lora-name",
         type=str,
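Taken together, a run exercising both new flags might look like this (model, port, and prompt counts are illustrative):

    python3 -m sglang.bench_serving --backend sglang --port 30000 \
        --num-prompts 1000 --skip-num-prompts 200 \
        --enable-expert-distribution-record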
14 changes: 14 additions & 0 deletions python/sglang/srt/configs/model_config.py
@@ -24,6 +24,7 @@

 from sglang.srt.hf_transformers_utils import get_config, get_context_length
 from sglang.srt.layers.quantization import QUANTIZATION_METHODS
+from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import get_bool_env_var, is_hip

 logger = logging.getLogger(__name__)
@@ -187,6 +188,19 @@ def __init__(
         self.hf_eos_token_id = self.get_hf_eos_token_id()
         self.image_token_id = getattr(self.hf_config, "image_token_id", None)

+    @staticmethod
+    def from_server_args(server_args: ServerArgs, model_path: str = None):
+        return ModelConfig(
+            model_path=model_path or server_args.model_path,
+            trust_remote_code=server_args.trust_remote_code,
+            revision=server_args.revision,
+            context_length=server_args.context_length,
+            model_override_args=server_args.json_model_override_args,
+            is_embedding=server_args.is_embedding,
+            dtype=server_args.dtype,
+            quantization=server_args.quantization,
+        )
+
     # adapted from https://github.com/vllm-project/vllm/blob/main/vllm/config.py#L289
     def get_total_num_kv_heads(self) -> int:
         """Returns the total number of KV heads."""
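The new factory centralizes the field-by-field construction that call sites such as bench_one_batch.py currently repeat (the TODO above appears to refer to this helper). A hedged usage sketch, with an illustrative model path:

    from sglang.srt.configs.model_config import ModelConfig
    from sglang.srt.server_args import ServerArgs

    server_args = ServerArgs(model_path="meta-llama/Llama-3.1-8B-Instruct")  # illustrative
    # model_path can be overridden, e.g. to build a config for a draft model.
    model_config = ModelConfig.from_server_args(server_args)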
1 change: 1 addition & 0 deletions python/sglang/srt/disaggregation/decode.py
@@ -426,6 +426,7 @@ def event_loop_normal_disagg_decode(self):
         while True:
             recv_reqs = self.recv_requests()
             self.process_input_requests(recv_reqs)
+            self.model_runner_event_loop_step()
             # polling and allocating kv cache
             self.process_decode_queue()
             batch = self.get_next_disagg_decode_batch_to_run()
1 change: 1 addition & 0 deletions python/sglang/srt/disaggregation/prefill.py
@@ -178,6 +178,7 @@ def event_loop_normal_disagg_prefill(self):
         while True:
             recv_reqs = self.recv_requests()
             self.process_input_requests(recv_reqs)
+            self.model_runner_event_loop_step()
             self.waiting_queue.extend(
                 self.disagg_prefill_pending_queue.pop_bootstrapped()
             )
90 changes: 87 additions & 3 deletions python/sglang/srt/entrypoints/engine.py
@@ -20,17 +20,24 @@
 import asyncio
 import atexit
 import dataclasses
+import json
 import logging
 import multiprocessing as mp
 import os
 import signal
 import threading
+from json import JSONDecodeError
+from pathlib import Path
 from typing import AsyncIterator, Dict, Iterator, List, Optional, Tuple, Union

 import zmq
 import zmq.asyncio
 from PIL.Image import Image

+from sglang.srt.configs.model_config import ModelConfig
+from sglang.srt.managers.eplb_manager import EPLBManager
+from sglang.srt.managers.expert_location import ExpertLocationMetadata
+
 # Fix a bug of Python threading
 setattr(threading, "_register_atexit", lambda *args, **kwargs: None)

@@ -45,13 +52,15 @@
 from sglang.srt.managers.detokenizer_manager import run_detokenizer_process
 from sglang.srt.managers.io_struct import (
     EmbeddingReqInput,
+    EplbRebalanceReqInput,
     GenerateReqInput,
     GetWeightsByNameReqInput,
     InitWeightsUpdateGroupReqInput,
     ReleaseMemoryOccupationReqInput,
     ResumeMemoryOccupationReqInput,
     RpcReqInput,
     RpcReqOutput,
+    UpdateExpertLocationReqInput,
     UpdateWeightFromDiskReqInput,
     UpdateWeightsFromDistributedReqInput,
     UpdateWeightsFromTensorReqInput,
@@ -279,6 +288,10 @@ def __exit__(self, exc_type, exc_value, traceback):
         self.shutdown()
         return False

+    def flush_cache(self):
+        loop = asyncio.get_event_loop()
+        return loop.run_until_complete(self.tokenizer_manager.flush_cache())
+
     def start_profile(self):
         loop = asyncio.get_event_loop()
         loop.run_until_complete(self.tokenizer_manager.start_profile())
@@ -355,10 +368,32 @@ def update_weights_from_tensor(
             self.tokenizer_manager.update_weights_from_tensor(obj, None)
         )

+    def eplb_rebalance(self):
+        loop = asyncio.get_event_loop()
+        return loop.run_until_complete(
+            self.tokenizer_manager.eplb_rebalance(EplbRebalanceReqInput())
+        )
+
+    def eplb_save_expert_distribution(self):
+        loop = asyncio.get_event_loop()
+        return loop.run_until_complete(
+            self.tokenizer_manager.eplb_save_expert_distribution()
+        )
+
+    def update_expert_location(self, expert_location_metadata: ExpertLocationMetadata):
+        obj = UpdateExpertLocationReqInput(
+            expert_location_metadata=expert_location_metadata,
+        )
+        loop = asyncio.get_event_loop()
+        return loop.run_until_complete(
+            self.tokenizer_manager.update_expert_location(obj)
+        )
+
     def update_weights_from_disk(
         self,
         model_path: str,
         load_format: Optional[str] = None,
+        param_categories: Optional[List[str]] = None,
     ):
         """Update the weights from disk inplace without re-launching the engine.

@@ -369,6 +404,7 @@ def update_weights_from_disk(
         obj = UpdateWeightFromDiskReqInput(
             model_path=model_path,
             load_format=load_format,
+            param_categories=param_categories,
         )

         loop = asyncio.get_event_loop()
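These methods make the EPLB flow scriptable from the offline Engine API; a hedged sketch (the model path is illustrative, and enable_eplb is the new server flag used elsewhere in this diff):

    import sglang as sgl

    # Illustrative MoE model; enable_eplb is forwarded to the server args.
    engine = sgl.Engine(model_path="deepseek-ai/DeepSeek-V2-Lite", enable_eplb=True)
    engine.generate("warm the router with some traffic")

    # Persist the observed expert load, then rebalance expert placement.
    engine.eplb_save_expert_distribution()
    engine.eplb_rebalance()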
@@ -496,6 +532,14 @@ def _launch_subprocesses(
         server_args.model_path, server_args.tokenizer_path
     )

+    if server_args.node_rank == 0:
+        eplb_manager = EPLBManager(server_args) if server_args.enable_eplb else None
+        expert_location_metadata = _compute_initial_expert_location_metadata(
+            server_args, eplb_manager
+        )
+    else:
+        eplb_manager = expert_location_metadata = None
+
     scheduler_procs = []
     if server_args.dp_size == 1:
         # Launch tensor parallel scheduler processes
@@ -517,7 +561,15 @@ def _launch_subprocesses(
             )
             proc = mp.Process(
                 target=run_scheduler_process,
-                args=(server_args, port_args, gpu_id, tp_rank, None, writer),
+                args=(
+                    server_args,
+                    port_args,
+                    expert_location_metadata,
+                    gpu_id,
+                    tp_rank,
+                    None,
+                    writer,
+                ),
             )
             with memory_saver_adapter.configure_subprocess():
                 proc.start()
@@ -529,7 +581,7 @@ def _launch_subprocesses(
         scheduler_pipe_readers = [reader]
         proc = mp.Process(
             target=run_data_parallel_controller_process,
-            args=(server_args, port_args, writer),
+            args=(server_args, port_args, expert_location_metadata, writer),
         )
         proc.start()
         scheduler_procs.append(proc)
@@ -566,7 +618,9 @@ def _launch_subprocesses(
     detoken_proc.start()

     # Launch tokenizer process
-    tokenizer_manager = TokenizerManager(server_args, port_args)
+    tokenizer_manager = TokenizerManager(
+        server_args, port_args, expert_location_metadata, eplb_manager
+    )
     if server_args.chat_template:
         load_chat_template_for_openai_api(
             tokenizer_manager, server_args.chat_template, server_args.model_path
@@ -598,3 +652,33 @@ def _launch_subprocesses(
     scheduler_info = scheduler_infos[0]
     tokenizer_manager.max_req_input_len = scheduler_info["max_req_input_len"]
     return tokenizer_manager, scheduler_info
+
+
+def _compute_initial_expert_location_metadata(
+    server_args: ServerArgs, eplb_manager: EPLBManager
+) -> ExpertLocationMetadata:
+    if (data := server_args.init_expert_location) is not None:
+        try:
+            data_dict = json.loads(data)
+        except JSONDecodeError:
+            data_dict = json.loads(Path(data).read_text())
+
+        if "physical_to_logical_map" in data_dict:
+            logger.info(
+                "init_expert_location from init_by_mapping using ServerArgs.init_expert_location"
+            )
+            return ExpertLocationMetadata.init_by_mapping(server_args, **data_dict)
+        elif "logical_count" in data_dict:
+            logger.info(
+                "init_expert_location from init_by_eplb using ServerArgs.init_expert_location"
+            )
+            return ExpertLocationMetadata.init_by_eplb(server_args, **data_dict)
+        else:
+            raise NotImplementedError(
+                f"Unknown init_expert_location format ({list(data_dict.keys())=})"
+            )
+    if server_args.enable_eplb:
+        logger.info("init_expert_location from EPLBManager")
+        return eplb_manager.compute_expert_location_metadata()
+
+    return ExpertLocationMetadata.init_trivial(server_args)
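--init-expert-location accepts either inline JSON or a path to a JSON file, in one of the two shapes keyed on above (array contents are illustrative; rows correspond to MoE layers):

    {"physical_to_logical_map": [[0, 1, 2, 3], [2, 3, 0, 1]]}

selects init_by_mapping (each physical expert slot names the logical expert it hosts), while

    {"logical_count": [[10, 3, 7, 1], [2, 9, 4, 6]]}

selects init_by_eplb (per-layer logical expert load for the balancer to place).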