
Commit 08ca0f6

[Feature] [PD] add simple router and refine splitwise deployment (#4709)

* add simple router and refine splitwise deployment
* fix

1 parent 831266d

39 files changed: 2,397 additions and 171 deletions

benchmarks/backend_request_func.py

Lines changed: 5 additions & 5 deletions

@@ -94,10 +94,11 @@ async def async_request_eb_openai_chat_completions(
         "stream_options": {
             "include_usage": True,
             "continuous_usage_stats": True,
-        }
+        },
+        "max_tokens": request_func_input.output_len,
     }
     if request_func_input.response_format:
-        payload["response_format"] =request_func_input.response_format
+        payload["response_format"] = request_func_input.response_format

     # 超参由yaml传入
     payload.update(request_func_input.hyper_parameters)
@@ -132,13 +133,13 @@ async def async_request_eb_openai_chat_completions(

             chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
             if chunk != "[DONE]":
-                #print("####chunk:", chunk, type(chunk))
+                # print("####chunk:", chunk, type(chunk))
                 timestamp = time.perf_counter()
                 data = json.loads(chunk)

                 if request_id == "None" and "id" in data:
                     request_id = data["id"]
-
+
                 if choices := data.get("choices"):
                     content = choices[0]["delta"].get("content")
                     reason_content = choices[0]["delta"].get("reasoning_content")
@@ -164,7 +165,6 @@ async def async_request_eb_openai_chat_completions(
                 elif usage := data.get("usage", {}):
                     output.output_tokens = usage.get("completion_tokens", 0)
                     output.prompt_tokens = usage.get("prompt_tokens", 0)
-

                 most_recent_timestamp = timestamp

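
Net effect of these hunks, as a minimal sketch: the benchmark request payload now carries the requested output length as "max_tokens" next to "stream_options", and "response_format" is attached only when provided. The values below are illustrative stand-ins, not taken from the diff.

```python
# Illustrative payload shape after this change; the real code builds it inside
# async_request_eb_openai_chat_completions from request_func_input.
output_len = 1024                              # stands in for request_func_input.output_len
response_format = {"type": "json_object"}      # stands in for request_func_input.response_format

payload = {
    "stream_options": {
        "include_usage": True,
        "continuous_usage_stats": True,
    },
    "max_tokens": output_len,                  # field newly added by this commit
}
if response_format:
    payload["response_format"] = response_format

print(payload)
```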

benchmarks/benchmark_dataset.py

Lines changed: 3 additions & 3 deletions

@@ -46,7 +46,7 @@ class SampleRequest:
     prompt_len: int
     expected_output_len: int
     response_format: Optional[dict] = None
-
+

 class BenchmarkDataset(ABC):
     """BenchmarkDataset"""
@@ -299,7 +299,7 @@ def sample(
             prompt = entry["messages"][-1].get("content", "")
             history_QA = entry.get("messages", [])
             response_format = entry.get("response_format")
-            new_output_len = int(entry.get("max_tokens", 12288))
+            new_output_len = int(entry.get("max_tokens", output_len if output_len else 12288))

             if enable_multimodal_chat:
                 prompt = self.apply_multimodal_chat_transformation(prompt, None)
@@ -311,7 +311,7 @@ def sample(
                     prompt_len=0,
                     history_QA=history_QA,
                     expected_output_len=new_output_len,
-                    response_format=response_format
+                    response_format=response_format,
                 )
             )
             cnt += 1
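
The behavioral change here is the output-length fallback in sample(): a per-entry "max_tokens" still wins, but the caller-supplied output_len is now tried before the hard-coded 12288 default. A minimal sketch of that precedence (the helper name is ours, not from the file):

```python
from typing import Optional


def resolve_output_len(entry: dict, output_len: Optional[int]) -> int:
    """Mirror of the updated expression in BenchmarkDataset.sample()."""
    return int(entry.get("max_tokens", output_len if output_len else 12288))


assert resolve_output_len({"max_tokens": 256}, 4096) == 256   # per-entry value wins
assert resolve_output_len({}, 4096) == 4096                   # caller-supplied fallback (new)
assert resolve_output_len({}, None) == 12288                  # hard-coded default
```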

benchmarks/benchmark_serving.py

Lines changed: 4 additions & 4 deletions

@@ -352,7 +352,7 @@ async def benchmark(
         ignore_eos=ignore_eos,
         debug=debug,
         extra_body=extra_body,
-        response_format=response_format
+        response_format=response_format,
     )

     print("test_input:", test_input)
@@ -384,7 +384,7 @@ async def benchmark(
             logprobs=logprobs,
             ignore_eos=ignore_eos,
             extra_body=extra_body,
-            response_format=response_format
+            response_format=response_format,
         )
         profile_output = await request_func(request_func_input=profile_input)
         if profile_output.success:
@@ -444,7 +444,7 @@ async def limited_request_func(request_func_input, pbar):
             debug=debug,
             ignore_eos=ignore_eos,
             extra_body=extra_body,
-            response_format=response_format
+            response_format=response_format,
         )
         tasks.append(asyncio.create_task(limited_request_func(request_func_input=request_func_input, pbar=pbar)))
     outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)
@@ -460,7 +460,7 @@ async def limited_request_func(request_func_input, pbar):
             api_url=base_url + "/stop_profile",
             output_len=test_output_len,
             logprobs=logprobs,
-            response_format=response_format
+            response_format=response_format,
         )
         profile_output = await request_func(request_func_input=profile_input)
         if profile_output.success:

benchmarks/yaml/qwen25_7b-vl-32k-bf16.yaml

Lines changed: 1 addition & 1 deletion (the removed and added lines differ only in trailing whitespace or the final newline)

@@ -3,4 +3,4 @@ max_num_seqs: 128
 gpu_memory_utilization: 0.85
 tensor_parallel_size: 1
 limit_mm_per_prompt: '{"image": 100, "video": 100}'
-enable_mm: True
+enable_mm: True

benchmarks/yaml/request_yaml/qwen25-vl-32k.yaml

Lines changed: 1 addition & 1 deletion (the removed and added lines differ only in trailing whitespace or the final newline)

@@ -5,4 +5,4 @@ metadata:
 max_tokens: 32768
 repetition_penalty: 1.05
 frequency_penalty: 0
-presence_penalty: 0
+presence_penalty: 0

docs/features/multi-node_deployment.md

Lines changed: 2 additions & 2 deletions (whitespace-only: the removed and added lines differ only in trailing whitespace)

@@ -26,7 +26,7 @@ We recommend using mpirun for one-command startup without manually starting each
 4. Ensure all nodes can resolve each other's hostnames

 * Online inference startup example:
-
+
 ```shell
 python -m fastdeploy.entrypoints.openai.api_server \
     --model baidu/ERNIE-4.5-300B-A47B-Paddle \
@@ -40,7 +40,7 @@ We recommend using mpirun for one-command startup without manually starting each
 ```

 * Offline startup example:
-
+
 ```python
 from fastdeploy.engine.sampling_params import SamplingParams
 from fastdeploy.entrypoints.llm import LLM

docs/zh/features/multi-node_deployment.md (Chinese documentation; same whitespace-only fix as the English page)

Lines changed: 2 additions & 2 deletions

@@ -26,7 +26,7 @@
 4. 确保所有节点能够解析彼此的主机名

 * 在线推理启动示例:
-
+
 ```shell
 python -m fastdeploy.entrypoints.openai.api_server \
     --model baidu/ERNIE-4.5-300B-A47B-Paddle \
@@ -40,7 +40,7 @@
 ```

 * 离线启动示例:
-
+
 ```python
 from fastdeploy.engine.sampling_params import SamplingParams
 from fastdeploy.entrypoints.llm import LLM

examples/splitwise/start_mixed.sh (new file)

Lines changed: 71 additions & 0 deletions

#!/bin/bash
set -e

wait_for_health() {
    local server_port=$1
    while true; do
        status_code=$(curl -s -o /dev/null -w "%{http_code}" "http://0.0.0.0:${server_port}/health" || echo "000")
        if [ "$status_code" -eq 200 ]; then
            break
        else
            echo "Service not ready. Retrying in 2s..."
            sleep 2
        fi
    done
}

# prepare environment
MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle"
# MODEL_NAME="baidu/ERNIE-4.5-21B-A3B-Paddle"

export FD_DEBUG=1
export ENABLE_V1_KVCACHE_SCHEDULER=0
export KVCACHE_GDRCOPY_FLUSH_ENABLE=1

unset http_proxy && unset https_proxy
rm -rf log_*

# start router
export FD_LOG_DIR="log_router"
mkdir -p ${FD_LOG_DIR}

router_port=9000
nohup python -m fastdeploy.router.launch \
    --port ${router_port} \
    2>&1 >${FD_LOG_DIR}/nohup &
sleep 1

# start modelserver 0
export CUDA_VISIBLE_DEVICES=0
export FD_LOG_DIR="log_server_0"
mkdir -p ${FD_LOG_DIR}

nohup python -m fastdeploy.entrypoints.openai.api_server \
    --model ${MODEL_NAME} \
    --port 8100 \
    --metrics-port 8101 \
    --engine-worker-queue-port 8102 \
    --cache-queue-port 8103 \
    --max-model-len 32768 \
    --router "0.0.0.0:${router_port}" \
    2>&1 >${FD_LOG_DIR}/nohup &
sleep 1

wait_for_health 8100

# start modelserver 1
export CUDA_VISIBLE_DEVICES=1
export FD_LOG_DIR="log_server_1"
mkdir -p ${FD_LOG_DIR}

nohup python -m fastdeploy.entrypoints.openai.api_server \
    --model ${MODEL_NAME} \
    --port 8200 \
    --metrics-port 8201 \
    --engine-worker-queue-port 8202 \
    --cache-queue-port 8203 \
    --max-model-len 32768 \
    --router "0.0.0.0:${router_port}" \
    2>&1 >${FD_LOG_DIR}/nohup &

wait_for_health 8200
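
Once both api_servers have registered with the router on port 9000, a client should only need the router's address. A hedged sketch follows; the /v1/chat/completions path and payload shape follow the OpenAI-compatible api_server convention and are assumptions, not something this diff shows.

```python
# Hypothetical client for the mixed deployment above: send the request to the
# router (port 9000), which is expected to forward it to a registered server.
import requests

resp = requests.post(
    "http://0.0.0.0:9000/v1/chat/completions",
    json={
        "model": "PaddlePaddle/ERNIE-4.5-0.3B-Paddle",
        "messages": [{"role": "user", "content": "Hello"}],
        "max_tokens": 32,
    },
    timeout=60,
)
resp.raise_for_status()
print(resp.json())
```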

examples/splitwise/start_v0_tp1.sh (new file)

Lines changed: 66 additions & 0 deletions

#!/bin/bash
set -e

# Test splitwise deployment
# v0 requires prefill and decode in one node and it uses local scheduler
# v1 supports prefill and decode in multi node and it uses splitwise scheduler
# v2 supports prefill and decode in multi node and it uses router and local scheduler

wait_for_health() {
    local server_port=$1
    while true; do
        status_code=$(curl -s -o /dev/null -w "%{http_code}" "http://0.0.0.0:${server_port}/health" || echo "000")
        if [ "$status_code" -eq 200 ]; then
            break
        else
            echo "Service not ready. Retrying in 2s..."
            sleep 2
        fi
    done
}

MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle"
# MODEL_NAME="baidu/ERNIE-4.5-21B-A3B-Paddle"
aistudio download --model ${MODEL_NAME}

unset http_proxy && unset https_proxy
rm -rf log_*

# start prefill
export FD_LOG_DIR="log_prefill"
mkdir -p ${FD_LOG_DIR}

export CUDA_VISIBLE_DEVICES=0
export FD_DEBUG=1
export ENABLE_V1_KVCACHE_SCHEDULER=0

nohup python -m fastdeploy.entrypoints.openai.api_server \
    --model ${MODEL_NAME} \
    --port 8100 \
    --metrics-port 8101 \
    --engine-worker-queue-port 8102 \
    --cache-queue-port 8103 \
    --max-model-len 32768 \
    --splitwise-role "prefill" \
    2>&1 >${FD_LOG_DIR}/nohup &
wait_for_health 8100

# start decode
export FD_LOG_DIR="log_decode"
mkdir -p ${FD_LOG_DIR}

export CUDA_VISIBLE_DEVICES=1
export FD_DEBUG=1
export ENABLE_V1_KVCACHE_SCHEDULER=0

nohup python -m fastdeploy.entrypoints.openai.api_server \
    --model ${MODEL_NAME} \
    --port 9000 \
    --metrics-port 9001 \
    --engine-worker-queue-port 9002 \
    --cache-queue-port 9003 \
    --max-model-len 32768 \
    --splitwise-role "decode" \
    --innode-prefill-ports 8102 \
    2>&1 >${FD_LOG_DIR}/nohup &
wait_for_health 9000
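
The readiness check these scripts implement with curl can also be done from Python when driving the launches programmatically. A small sketch, assuming the requests package is available; the /health endpoint and the HTTP 200 check come from the script above.

```python
import time

import requests


def wait_for_health(server_port: int, interval: float = 2.0) -> None:
    """Poll the api_server's /health endpoint until it answers HTTP 200."""
    url = f"http://0.0.0.0:{server_port}/health"
    while True:
        try:
            if requests.get(url, timeout=5).status_code == 200:
                return
        except requests.RequestException:
            pass
        print(f"Service on port {server_port} not ready. Retrying in {interval}s...")
        time.sleep(interval)


wait_for_health(8100)  # prefill
wait_for_health(9000)  # decode
```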

examples/splitwise/start_v1_tp1.sh (new file)

Lines changed: 96 additions & 0 deletions

#!/bin/bash
set -e

# Test splitwise deployment
# v0 requires prefill and decode in one node and it uses local scheduler
# v1 supports prefill and decode in multi node and it uses splitwise scheduler
# v2 supports prefill and decode in multi node and it uses router and local scheduler

wait_for_health() {
    local server_port=$1
    while true; do
        status_code=$(curl -s -o /dev/null -w "%{http_code}" "http://0.0.0.0:${server_port}/health" || echo "000")
        if [ "$status_code" -eq 200 ]; then
            break
        else
            echo "Service not ready. Retrying in 2s..."
            sleep 2
        fi
    done
}

# prepare environment
MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle"
# MODEL_NAME="baidu/ERNIE-4.5-21B-A3B-Paddle"

export FD_DEBUG=1
export ENABLE_V1_KVCACHE_SCHEDULER=0
export KVCACHE_GDRCOPY_FLUSH_ENABLE=1

SCRIPT_PATH=$(readlink -f "$0")
SCRIPT_DIR=$(dirname "$SCRIPT_PATH")
export $(bash ${SCRIPT_DIR}/../../scripts/get_rdma_nics.sh gpu)
echo "KVCACHE_RDMA_NICS:${KVCACHE_RDMA_NICS}"
if [ -z "${KVCACHE_RDMA_NICS}" ]; then
    echo "KVCACHE_RDMA_NICS is empty, please check the output of get_rdma_nics.sh"
    exit 1
fi

unset http_proxy && unset https_proxy
rm -rf log_*

# start redis
if ! redis-cli ping &>/dev/null; then
    echo "Redis is not running. Starting redis-server..."
    redis-server --daemonize yes
    sleep 1
else
    echo "Redis is already running."
fi
sleep 1

# start prefill
export CUDA_VISIBLE_DEVICES=0
export FD_LOG_DIR="log_prefill"
mkdir -p ${FD_LOG_DIR}

nohup python -m fastdeploy.entrypoints.openai.api_server \
    --model ${MODEL_NAME} \
    --port 8100 \
    --metrics-port 8101 \
    --engine-worker-queue-port 8102 \
    --cache-queue-port 8103 \
    --max-model-len 32768 \
    --splitwise-role "prefill" \
    --cache-transfer-protocol "rdma,ipc" \
    --rdma-comm-ports 8104 \
    --pd-comm-port 8105 \
    --scheduler-name "splitwise" \
    --scheduler-host "127.0.0.1" \
    --scheduler-port 6379 \
    --scheduler-ttl 9000 \
    2>&1 >${FD_LOG_DIR}/nohup &
wait_for_health 8100

# start decode
export CUDA_VISIBLE_DEVICES=1
export FD_LOG_DIR="log_decode"
mkdir -p ${FD_LOG_DIR}

nohup python -m fastdeploy.entrypoints.openai.api_server \
    --model ${MODEL_NAME} \
    --port 9000 \
    --metrics-port 9001 \
    --engine-worker-queue-port 9002 \
    --cache-queue-port 9003 \
    --max-model-len 32768 \
    --splitwise-role "decode" \
    --cache-transfer-protocol "rdma,ipc" \
    --rdma-comm-ports 9004 \
    --pd-comm-port 9005 \
    --scheduler-name "splitwise" \
    --scheduler-host "127.0.0.1" \
    --scheduler-port 6379 \
    --scheduler-ttl 9000 \
    2>&1 >${FD_LOG_DIR}/nohup &
wait_for_health 9000
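
The v1 path additionally depends on Redis (the splitwise scheduler's store at 127.0.0.1:6379) and on the RDMA NIC list exported by get_rdma_nics.sh. A small preflight sketch mirroring those two checks; redis-py is an assumed dependency, while the script itself uses redis-cli.

```python
import os
import sys

import redis  # pip install redis


def preflight() -> None:
    """Mirror the script's checks before launching the prefill/decode servers."""
    if not os.environ.get("KVCACHE_RDMA_NICS"):
        sys.exit("KVCACHE_RDMA_NICS is empty, please check the output of get_rdma_nics.sh")
    try:
        redis.Redis(host="127.0.0.1", port=6379, socket_connect_timeout=2).ping()
    except redis.exceptions.ConnectionError:
        sys.exit("Redis is not reachable on 127.0.0.1:6379; start redis-server first.")


if __name__ == "__main__":
    preflight()
    print("Splitwise v1 prerequisites look OK.")
```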
