Skip to content

Commit c90f094

Browse files
author
root
committed
refine router
1 parent e650a65 commit c90f094

File tree

15 files changed

+452
-207
lines changed

15 files changed

+452
-207
lines changed

examples/splitwise/start_v0.sh

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,14 @@
11
#!/bin/bash
2+
set -e
3+
24
# Test splitwise deployment
35
# v0 requires prefill and decode in one node and it uses local scheduler
46
# v1 supports prefill and decode in multi node and it uses splitwise scheduler
57
# v2 supports prefill and decode in multi node and it uses router and local scheduler
68

79
# start prefill
810
export FD_LOG_DIR="log_prefill"
9-
rm -rf ${FD_LOG_DIR}
11+
rm -rf log_*
1012
mkdir -p ${FD_LOG_DIR}
1113

1214
export CUDA_VISIBLE_DEVICES=0
@@ -26,7 +28,6 @@ sleep 2
2628

2729
# start decode
2830
export FD_LOG_DIR="log_decode"
29-
rm -rf ${FD_LOG_DIR}
3031
mkdir -p ${FD_LOG_DIR}
3132

3233
export CUDA_VISIBLE_DEVICES=1

examples/splitwise/start_v1.sh

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
#!/bin/bash
2+
set -e
3+
24
# Test splitwise deployment
35
# v0 requires prefill and decode in one node and it uses local scheduler
46
# v1 supports prefill and decode in multi node and it uses splitwise scheduler
@@ -14,6 +16,10 @@ SCRIPT_PATH=$(readlink -f "$0")
1416
SCRIPT_DIR=$(dirname "$SCRIPT_PATH")
1517
export $(bash ${SCRIPT_DIR}/../../scripts/get_rdma_nics.sh gpu)
1618
echo "KVCACHE_RDMA_NICS:${KVCACHE_RDMA_NICS}"
19+
if [ -z "${KVCACHE_RDMA_NICS}" ]; then
20+
echo "KVCACHE_RDMA_NICS is empty, please check the output of get_rdma_nics.sh"
21+
exit 1
22+
fi
1723

1824
# start redis
1925
if ! redis-cli ping &>/dev/null; then
@@ -28,7 +34,7 @@ sleep 1
2834
# start prefill
2935
export CUDA_VISIBLE_DEVICES=0
3036
export FD_LOG_DIR="log_prefill"
31-
rm -rf ${FD_LOG_DIR}
37+
rm -rf log_*
3238
mkdir -p ${FD_LOG_DIR}
3339

3440
nohup python -m fastdeploy.entrypoints.openai.api_server \
@@ -52,7 +58,6 @@ sleep 1
5258
# start decode
5359
export CUDA_VISIBLE_DEVICES=1
5460
export FD_LOG_DIR="log_decode"
55-
rm -rf ${FD_LOG_DIR}
5661
mkdir -p ${FD_LOG_DIR}
5762

5863
nohup python -m fastdeploy.entrypoints.openai.api_server \

examples/splitwise/start_v1_tp2.sh

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
#!/bin/bash
2+
set -e
3+
24
# Test splitwise deployment
35
# v0 requires prefill and decode in one node and it uses local scheduler
46
# v1 supports prefill and decode in multi node and it uses splitwise scheduler
@@ -14,6 +16,10 @@ SCRIPT_PATH=$(readlink -f "$0")
1416
SCRIPT_DIR=$(dirname "$SCRIPT_PATH")
1517
export $(bash ${SCRIPT_DIR}/../../scripts/get_rdma_nics.sh gpu)
1618
echo "KVCACHE_RDMA_NICS:${KVCACHE_RDMA_NICS}"
19+
if [ -z "${KVCACHE_RDMA_NICS}" ]; then
20+
echo "KVCACHE_RDMA_NICS is empty, please check the output of get_rdma_nics.sh"
21+
exit 1
22+
fi
1723

1824
# start redis
1925
if ! redis-cli ping &>/dev/null; then
@@ -28,7 +34,7 @@ sleep 1
2834
# start prefill
2935
export CUDA_VISIBLE_DEVICES=0,1
3036
export FD_LOG_DIR="log_prefill"
31-
rm -rf ${FD_LOG_DIR}
37+
rm -rf log_*
3238
mkdir -p ${FD_LOG_DIR}
3339

3440
nohup python -m fastdeploy.entrypoints.openai.api_server \
@@ -53,7 +59,6 @@ sleep 1
5359
# start decode
5460
export CUDA_VISIBLE_DEVICES=2,3
5561
export FD_LOG_DIR="log_decode"
56-
rm -rf ${FD_LOG_DIR}
5762
mkdir -p ${FD_LOG_DIR}
5863

5964
nohup python -m fastdeploy.entrypoints.openai.api_server \

examples/splitwise/start_v2.sh

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,15 @@
11
#!/bin/bash
2+
set -e
3+
24
# Test splitwise deployment
35
# v0 requires prefill and decode in one node and it uses local scheduler
46
# v1 supports prefill and decode in multi node and it uses splitwise scheduler
57
# v2 supports prefill and decode in multi node and it uses router and local scheduler
68

79
# prepare environment
810
MODEL_NAME="baidu/ERNIE-4.5-0.3B-Paddle"
11+
# MODEL_NAME="baidu/ERNIE-4.5-21B-A3B-Paddle"
12+
913
export FD_DEBUG=1
1014
export ENABLE_V1_KVCACHE_SCHEDULER=0
1115
export KVCACHE_GDRCOPY_FLUSH_ENABLE=1
@@ -14,22 +18,26 @@ SCRIPT_PATH=$(readlink -f "$0")
1418
SCRIPT_DIR=$(dirname "$SCRIPT_PATH")
1519
export $(bash ${SCRIPT_DIR}/../../scripts/get_rdma_nics.sh gpu)
1620
echo "KVCACHE_RDMA_NICS:${KVCACHE_RDMA_NICS}"
21+
if [ -z "${KVCACHE_RDMA_NICS}" ]; then
22+
echo "KVCACHE_RDMA_NICS is empty, please check the output of get_rdma_nics.sh"
23+
exit 1
24+
fi
1725

1826
# start router
1927
export FD_LOG_DIR="log_router"
20-
rm -rf ${FD_LOG_DIR}
28+
rm -rf log_*
2129
mkdir -p ${FD_LOG_DIR}
2230

2331
router_port=9000
2432
nohup python -m fastdeploy.router.launch \
2533
--port ${router_port} \
34+
--splitwise \
2635
2>&1 >${FD_LOG_DIR}/nohup &
2736
sleep 1
2837

2938
# start prefill
3039
export CUDA_VISIBLE_DEVICES=0
3140
export FD_LOG_DIR="log_prefill"
32-
rm -rf ${FD_LOG_DIR}
3341
mkdir -p ${FD_LOG_DIR}
3442

3543
nohup python -m fastdeploy.entrypoints.openai.api_server \
@@ -40,6 +48,7 @@ nohup python -m fastdeploy.entrypoints.openai.api_server \
4048
--cache-queue-port 8103 \
4149
--max-model-len 32768 \
4250
--splitwise-role "prefill" \
51+
--cache-transfer-protocol "ipc,rdma" \
4352
--rdma-comm-ports 8104 \
4453
--pd-comm-port 8105 \
4554
--router "0.0.0.0:${router_port}" \
@@ -49,7 +58,6 @@ sleep 1
4958
# start decode
5059
export CUDA_VISIBLE_DEVICES=1
5160
export FD_LOG_DIR="log_decode"
52-
rm -rf ${FD_LOG_DIR}
5361
mkdir -p ${FD_LOG_DIR}
5462

5563
nohup python -m fastdeploy.entrypoints.openai.api_server \
@@ -60,6 +68,7 @@ nohup python -m fastdeploy.entrypoints.openai.api_server \
6068
--cache-queue-port 8203 \
6169
--max-model-len 32768 \
6270
--splitwise-role "decode" \
71+
--cache-transfer-protocol "ipc,rdma" \
6372
--rdma-comm-ports 8204 \
6473
--pd-comm-port 8205 \
6574
--router "0.0.0.0:${router_port}" \

examples/splitwise/start_v2_tp2.sh

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
#!/bin/bash
2+
set -e
3+
24
# Test splitwise deployment
35
# v0 requires prefill and decode in one node and it uses local scheduler
46
# v1 supports prefill and decode in multi node and it uses splitwise scheduler
@@ -14,22 +16,26 @@ SCRIPT_PATH=$(readlink -f "$0")
1416
SCRIPT_DIR=$(dirname "$SCRIPT_PATH")
1517
export $(bash ${SCRIPT_DIR}/../../scripts/get_rdma_nics.sh gpu)
1618
echo "KVCACHE_RDMA_NICS:${KVCACHE_RDMA_NICS}"
19+
if [ -z "${KVCACHE_RDMA_NICS}" ]; then
20+
echo "KVCACHE_RDMA_NICS is empty, please check the output of get_rdma_nics.sh"
21+
exit 1
22+
fi
1723

1824
# start router
1925
export FD_LOG_DIR="log_router"
20-
rm -rf ${FD_LOG_DIR}
26+
rm -rf log_*
2127
mkdir -p ${FD_LOG_DIR}
2228

2329
router_port=9000
2430
nohup python -m fastdeploy.router.launch \
2531
--port ${router_port} \
32+
--splitwise \
2633
2>&1 >${FD_LOG_DIR}/nohup &
2734
sleep 1
2835

2936
# start prefill
3037
export CUDA_VISIBLE_DEVICES=0,1
3138
export FD_LOG_DIR="log_prefill"
32-
rm -rf ${FD_LOG_DIR}
3339
mkdir -p ${FD_LOG_DIR}
3440

3541
nohup python -m fastdeploy.entrypoints.openai.api_server \
@@ -50,7 +56,6 @@ sleep 1
5056
# start decode
5157
export CUDA_VISIBLE_DEVICES=2,3
5258
export FD_LOG_DIR="log_decode"
53-
rm -rf ${FD_LOG_DIR}
5459
mkdir -p ${FD_LOG_DIR}
5560

5661
nohup python -m fastdeploy.entrypoints.openai.api_server \

fastdeploy/cache_manager/cache_messager.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,6 @@
3535
from fastdeploy.model_executor.ops.gpu import get_output_kv_signal, set_data_ipc
3636
from fastdeploy.utils import envs, get_logger
3737

38-
logger = get_logger("cache_messager", "cache_messager.log")
39-
4038

4139
def parse_args():
4240
"""

fastdeploy/config.py

Lines changed: 36 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1303,6 +1303,24 @@ def print(self):
13031303
logger.info("=============================================================")
13041304

13051305

1306+
class RouterConfig:
1307+
"""
1308+
Configuration for router
1309+
Attributes:
1310+
router: the url of router, such as http://127.0.0.1:8000
1311+
api_server_host: the host ip of model server
1312+
api_server_port: the http port of model server
1313+
"""
1314+
1315+
def __init__(self, args: dict):
1316+
self.router = args["router"]
1317+
if self.router is not None and not self.router.startswith(("http://", "https://")):
1318+
self.router = f"http://{self.router}"
1319+
1320+
self.api_server_host = get_host_ip()
1321+
self.api_server_port = args["port"]
1322+
1323+
13061324
class CommitConfig:
13071325
"""
13081326
Configuration for tracking version information from version.txt
@@ -1404,6 +1422,7 @@ def __init__(
14041422
speculative_config: SpeculativeConfig = None,
14051423
eplb_config: EPLBConfig = None,
14061424
structured_outputs_config: StructuredOutputsConfig = None,
1425+
router_config: RouterConfig = None,
14071426
tokenizer: str = None,
14081427
ips: str = None,
14091428
use_warmup: bool = False,
@@ -1416,7 +1435,6 @@ def __init__(
14161435
early_stop_config: Optional[Dict[str, Any]] = None,
14171436
tool_parser: str = None,
14181437
test_mode=False,
1419-
port=None,
14201438
):
14211439
self.model_config: ModelConfig = model_config # type: ignore
14221440
self.cache_config: CacheConfig = cache_config # type: ignore
@@ -1432,6 +1450,7 @@ def __init__(
14321450
self.cache_config: CacheConfig = cache_config # type: ignore
14331451
self.plas_attention_config: Optional[PlasAttentionConfig] = plas_attention_config
14341452
self.structured_outputs_config: StructuredOutputsConfig = structured_outputs_config
1453+
self.router_config: RouterConfig = router_config
14351454

14361455
# Initialize cuda graph capture list
14371456
max_capture_shape = self.scheduler_config.max_num_seqs
@@ -1459,7 +1478,6 @@ def __init__(
14591478
self.ips = self.ips.split(",")
14601479

14611480
self.host_ip = get_host_ip()
1462-
self.port = port
14631481

14641482
if self.ips is None:
14651483
self.nnode = 1
@@ -1730,39 +1748,39 @@ def init_cache_info(self):
17301748
"""
17311749
initialize cache info
17321750
"""
1733-
# TODO: group the splitiwse params
1751+
# TODO: group the splitiwse params, remove code of v0
17341752
# v0 requires prefill and decode in one node and it uses local scheduler
17351753
# v1 supports prefill and decode in multi node and it uses splitwise or dp scheduler
17361754
# v2 supports prefill and decode in multi node and it uses router and local scheduler
17371755
self.splitwise_version = None
1738-
if self.scheduler_config.name == "local" and self.scheduler_config.router is None:
1756+
if self.scheduler_config.name == "local" and (self.router_config is None or self.router_config.router is None):
17391757
self.splitwise_version = "v0"
17401758
elif self.scheduler_config.name in ("splitwise", "dp"):
17411759
self.splitwise_version = "v1"
1742-
elif self.scheduler_config.name == "local" and self.scheduler_config.router:
1760+
elif self.scheduler_config.name == "local" and self.router_config and self.router_config.router:
17431761
self.splitwise_version = "v2"
17441762
else:
17451763
raise ValueError(
17461764
f"Unsupported scheduler mode, scheduler_name: {self.scheduler_config.name}, "
1747-
f"router: {self.scheduler_config.router}"
1765+
f"router_config: {self.router_config}"
17481766
)
17491767
logger.info(f"splitwise_version: {self.splitwise_version}")
17501768

1769+
if isinstance(self.parallel_config.engine_worker_queue_port, (int, str)):
1770+
engine_worker_queue_port = self.parallel_config.engine_worker_queue_port
1771+
else:
1772+
engine_worker_queue_port = self.parallel_config.engine_worker_queue_port[
1773+
self.parallel_config.local_data_parallel_id
1774+
]
1775+
connector_port = self.cache_config.pd_comm_port[0] if self.cache_config.pd_comm_port else None
1776+
17511777
self.disaggregate_info = {}
17521778
if self.scheduler_config.splitwise_role != "mixed":
17531779
self.disaggregate_info["role"] = self.scheduler_config.splitwise_role
17541780
self.disaggregate_info["cache_info"] = dict()
17551781
current_protocol = self.cache_config.cache_transfer_protocol.split(",")
17561782
self.disaggregate_info["transfer_protocol"] = current_protocol
17571783

1758-
if isinstance(self.parallel_config.engine_worker_queue_port, (int, str)):
1759-
engine_worker_queue_port = self.parallel_config.engine_worker_queue_port
1760-
else:
1761-
engine_worker_queue_port = self.parallel_config.engine_worker_queue_port[
1762-
self.parallel_config.local_data_parallel_id
1763-
]
1764-
connector_port = self.cache_config.pd_comm_port[0] if self.cache_config.pd_comm_port else None
1765-
17661784
for protocol in current_protocol:
17671785
if protocol == "ipc":
17681786
self.disaggregate_info["cache_info"][protocol] = {
@@ -1778,17 +1796,18 @@ def init_cache_info(self):
17781796
}
17791797
logger.info(f"disaggregate_info: {self.disaggregate_info}")
17801798

1781-
self.splitwise_instance_info = {
1799+
if self.router_config:
1800+
self.register_info = {
17821801
"role": self.scheduler_config.splitwise_role,
17831802
"host_ip": self.host_ip,
1784-
"port": self.port,
1803+
"port": self.router_config.api_server_port,
17851804
"connector_port": connector_port,
17861805
"rdma_ports": self.cache_config.rdma_comm_ports,
17871806
"engine_worker_queue_port": engine_worker_queue_port,
17881807
"device_ids": self.local_device_ids,
17891808
"transfer_protocol": self.cache_config.cache_transfer_protocol.split(","),
17901809
}
1791-
logger.info(f"splitwise_instance_info: {self.splitwise_instance_info}")
1810+
logger.info(f"register_info: {self.register_info}")
17921811

17931812
def read_from_config(self):
17941813
"""

fastdeploy/engine/args_utils.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
ParallelConfig,
3535
PlasAttentionConfig,
3636
PoolerConfig,
37+
RouterConfig,
3738
RunnerOption,
3839
SpeculativeConfig,
3940
StructuredOutputsConfig,
@@ -1141,6 +1142,7 @@ def create_engine_config(self, port_availability_check=True) -> FDConfig:
11411142
scheduler_cfg = self.create_scheduler_config()
11421143
graph_opt_cfg = self.create_graph_optimization_config()
11431144
plas_attention_config = self.create_plas_attention_config()
1145+
router_config = RouterConfig(all_dict)
11441146

11451147
early_stop_cfg = self.create_early_stop_config()
11461148
early_stop_cfg.update_enable_early_stop(self.enable_early_stop)
@@ -1160,6 +1162,7 @@ def create_engine_config(self, port_availability_check=True) -> FDConfig:
11601162
speculative_config=speculative_cfg,
11611163
eplb_config=eplb_cfg,
11621164
structured_outputs_config=structured_outputs_config,
1165+
router_config=router_config,
11631166
ips=self.ips,
11641167
use_warmup=self.use_warmup,
11651168
limit_mm_per_prompt=self.limit_mm_per_prompt,
@@ -1172,5 +1175,4 @@ def create_engine_config(self, port_availability_check=True) -> FDConfig:
11721175
graph_opt_config=graph_opt_cfg,
11731176
plas_attention_config=plas_attention_config,
11741177
early_stop_config=early_stop_cfg,
1175-
port=self.port,
11761178
)

0 commit comments

Comments
 (0)