
Commit ba9bf97

Committed by: tvoas, CharleneHu-42, jikunshang, ys950902, xinyu-intel

[deepseek_r1] General PP enabling

Co-authored-by: Hu, Yabai <[email protected]>
Co-authored-by: Ji, Kunshang <[email protected]>
Co-authored-by: Sheng, Yi <[email protected]>
Co-authored-by: Chen, Xinyu <[email protected]>
Co-authored-by: Voas, Tanner <[email protected]>
Signed-off-by: Voas, Tanner <[email protected]>
1 parent ff916ed commit ba9bf97

File tree: 18 files changed, +720 -84 lines
Lines changed: 62 additions & 13 deletions
@@ -1,18 +1,49 @@
 #!/bin/bash
 
 # Check for minimum number of required arguments
+Help() {
+    # Display Help
+    echo "Usage: $0 docker_image head_node_address --head|--worker path_to_hf_home"
+    echo "       [-h] [-d hpu|gpu] [-c true|false] [-- additional_args...]"
+}
+
 if [ $# -lt 4 ]; then
-    echo "Usage: $0 docker_image head_node_address --head|--worker path_to_hf_home [additional_args...]"
+    Help
     exit 1
 fi
 
-# Assign the first three arguments and shift them away
+# Assign the first four arguments and shift them away
 DOCKER_IMAGE="$1"
 HEAD_NODE_ADDRESS="$2"
 NODE_TYPE="$3" # Should be --head or --worker
 PATH_TO_HF_HOME="$4"
 shift 4
 
+PLATFORM="gpu"
+CLEANUP_ON_EXIT="true"
+
+# Get the options
+while getopts hd:c: flag; do
+    case $flag in
+        h) # display Help
+            Help
+            exit
+            ;;
+        d) # get the device type
+            PLATFORM=$OPTARG ;;
+        c) # get the cleanup-on-exit flag
+            CLEANUP_ON_EXIT=$OPTARG ;;
+        \?) # Invalid option
+            echo "Error: Invalid option"
+            Help
+            exit
+            ;;
+    esac
+done
+
+# Shift the processed options and their arguments
+shift $((OPTIND - 1))
+
 # Additional arguments are passed directly to the Docker command
 ADDITIONAL_ARGS=("$@")
 

@@ -27,23 +58,41 @@ cleanup() {
     docker stop node
     docker rm node
 }
-trap cleanup EXIT
+if [[ "$CLEANUP_ON_EXIT" == "true" ]]; then
+    trap cleanup EXIT
+fi
 
 # Command setup for head or worker node
 RAY_START_CMD="ray start --block"
 if [ "${NODE_TYPE}" == "--head" ]; then
-    RAY_START_CMD+=" --head --port=6379"
+    RAY_START_CMD+=" --head --node-ip-address ${HEAD_NODE_ADDRESS} --port=6379"
 else
     RAY_START_CMD+=" --address=${HEAD_NODE_ADDRESS}:6379"
 fi
 
 # Run the docker command with the user specified parameters and additional arguments
-docker run \
-    --entrypoint /bin/bash \
-    --network host \
-    --name node \
-    --shm-size 10.24g \
-    --gpus all \
-    -v "${PATH_TO_HF_HOME}:/root/.cache/huggingface" \
-    "${ADDITIONAL_ARGS[@]}" \
-    "${DOCKER_IMAGE}" -c "${RAY_START_CMD}"
+if [[ "$PLATFORM" == "hpu" ]]; then
+    docker run \
+        -td \
+        --entrypoint /bin/bash \
+        --network host \
+        --ipc=host \
+        --name node \
+        --runtime=habana \
+        -e HABANA_VISIBLE_DEVICES=all \
+        -e GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME} \
+        -e HCCL_SOCKET_IFNAME=${HCCL_SOCKET_IFNAME} \
+        -v "${PATH_TO_HF_HOME}:/root/.cache/huggingface" \
+        "${ADDITIONAL_ARGS[@]}" \
+        "${DOCKER_IMAGE}" -c "${RAY_START_CMD}"
+else
+    docker run \
+        --entrypoint /bin/bash \
+        --network host \
+        --name node \
+        --shm-size 10.24g \
+        --gpus all \
+        -v "${PATH_TO_HF_HOME}:/root/.cache/huggingface" \
+        "${ADDITIONAL_ARGS[@]}" \
+        "${DOCKER_IMAGE}" -c "${RAY_START_CMD}"
+fi
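
For reference, a usage sketch of the updated launcher. The script name run_cluster.sh, image name, and addresses below are illustrative assumptions, not taken from this diff:

    # Head node on HPU; -c false skips the cleanup trap so containers survive script exit:
    bash run_cluster.sh vllm-image 192.168.1.10 --head /mnt/hf_home -d hpu -c false
    # Each worker node joins the same head address:
    bash run_cluster.sh vllm-image 192.168.1.10 --worker /mnt/hf_home -d hpu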

scripts/benchmark_client_param.sh

Lines changed: 67 additions & 0 deletions (new file)
#!/bin/bash
set -e

# Usage: source benchmark_client_param.sh
#        test_benchmark_client_serving INPUT_LEN OUTPUT_LEN MAX_CONCURRENCY NUM_PROMPTS [LEN_RATIO] [HOST] [PORT] [MODEL_PATH] [RESULTS_DIR]
#
# Arguments:
#   INPUT_LEN        Length of the input sequence (number of tokens).
#   OUTPUT_LEN       Length of the output sequence (number of tokens).
#   MAX_CONCURRENCY  Maximum number of concurrent requests to send to the server.
#   NUM_PROMPTS      Number of prompts to send in the benchmark.
#   LEN_RATIO        (Optional) Ratio of minimum to maximum generated input/output lengths. Default: 1.0.
#   HOST             (Optional) Host address of the server. Default: 127.0.0.1.
#   PORT             (Optional) Port of the server. Default: 8688.
#   MODEL_PATH       (Optional) Path to the model. Default: /root/.cache/huggingface/DeepSeek-R1-BF16-w8afp8-dynamic-no-ste-G2.
#   RESULTS_DIR      (Optional) Directory to store benchmark results. Default: logs/test-results.
#
# Description:
#   This script defines a function `test_benchmark_client_serving` that runs a client-side
#   benchmark for vLLM serving. It sends random input prompts to the server and measures
#   performance metrics such as throughput and latency. The results are saved in JSON format
#   in the specified results directory.
#
#   Use this script to evaluate the performance of a vLLM server under different client
#   configurations, including varying input/output lengths, concurrency levels, and prompt counts.

test_benchmark_client_serving() {
    export PT_HPU_LAZY_MODE=1
    INPUT_LEN=$1
    OUTPUT_LEN=$2
    MAX_CONCURRENCY=$3
    NUM_PROMPTS=$4
    LEN_RATIO=${5:-1.0}
    HOST=${6:-127.0.0.1}
    PORT=${7:-8688}
    MODEL_PATH=${8:-${MODEL_PATH:-/root/.cache/huggingface/DeepSeek-R1-BF16-w8afp8-dynamic-no-ste-G2}}
    RESULTS_DIR=${9:-logs/test-results}
    mkdir -p "$RESULTS_DIR"

    export no_proxy=localhost,${HOST},10.239.129.9

    # Run serving benchmark
    echo "Running serving benchmark: input=${INPUT_LEN}, output=${OUTPUT_LEN}, concurrency=${MAX_CONCURRENCY}, prompts=${NUM_PROMPTS}, ratio=${LEN_RATIO},"
    echo "  host=${HOST}, port=${PORT},"
    echo "  model=${MODEL_PATH},"
    echo "  results=${RESULTS_DIR}"

    TIMESTAMP=$(TZ='Asia/Kolkata' date +%F-%H-%M-%S)
    LOG_BASE="benchmark_${NUM_PROMPTS}prompts_${MAX_CONCURRENCY}bs_in${INPUT_LEN}_out${OUTPUT_LEN}_ratio${LEN_RATIO}_${TIMESTAMP}"

    python3 ../benchmarks/benchmark_serving.py \
        --backend vllm \
        --model "${MODEL_PATH}" \
        --trust-remote-code \
        --host "${HOST}" \
        --port "${PORT}" \
        --dataset-name random \
        --random-input-len "${INPUT_LEN}" \
        --random-output-len "${OUTPUT_LEN}" \
        --random-range-ratio "${LEN_RATIO}" \
        --max-concurrency "${MAX_CONCURRENCY}" \
        --num-prompts "${NUM_PROMPTS}" \
        --request-rate inf \
        --seed 0 \
        --ignore-eos \
        --save-result \
        --result-filename "${RESULTS_DIR}/${LOG_BASE}.json"
}
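
A usage sketch following the script's own header comment (the numeric values are illustrative only):

    source scripts/benchmark_client_param.sh
    # 1024-token inputs and outputs, 32-way concurrency, 128 prompts total:
    test_benchmark_client_serving 1024 1024 32 128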

scripts/benchmark_server_param.sh

Lines changed: 151 additions & 0 deletions (new file)
#!/bin/bash
set -x

# Usage: benchmark_server_param.sh NUM_NODES MAX_MODEL_LEN MAX_NUM_SEQS TP_SIZE PP_SIZE \
#            COMM_BACKEND [PP_LAYER_PARTITION] [KV_CACHE_DTYPE] \
#            [DO_WARMUP] [DO_PROFILE] [HOST] [PORT] [MODEL_PATH] [RESULTS_DIR]
#
# Arguments:
#   NUM_NODES           Number of nodes to use for the server.
#   MAX_MODEL_LEN       Maximum model length (number of tokens).
#   MAX_NUM_SEQS        Maximum number of sequences to process concurrently.
#   TP_SIZE             Tensor parallelism size.
#   PP_SIZE             Pipeline parallelism size.
#   COMM_BACKEND        Communication backend to use (e.g., hccl, gloo).
#   PP_LAYER_PARTITION  (Optional) Layer partitioning for pipeline parallelism (comma-separated list).
#   KV_CACHE_DTYPE      (Optional) Data type for KV cache (e.g., auto, fp8_inc). Default: auto.
#   DO_WARMUP           (Optional) Whether to perform warmup before benchmarking (true/false). Default: true.
#   DO_PROFILE          (Optional) Whether to enable profiling (true/false). Default: false.
#   HOST                (Optional) Host address for the server. Default: 127.0.0.1.
#   PORT                (Optional) Port for the server. Default: 8688.
#   MODEL_PATH          (Optional) Path to the model. Default: /root/.cache/huggingface/DeepSeek-R1-BF16-w8afp8-dynamic-no-ste-G2.
#   RESULTS_DIR         (Optional) Directory to store results and logs. Default: logs/test-results.
#
# Description:
#   This script launches a vLLM server with the specified configuration for benchmarking.
#   It supports tensor- and pipeline-parallel configurations, multiple communication backends,
#   and optional profiling. The script sets up the environment, configures memory and scheduling
#   parameters, and starts the server with the provided arguments.
#
#   Use this script as part of a benchmarking workflow to evaluate the performance of vLLM
#   under different configurations.

NUM_NODES=$1
MAX_MODEL_LEN=$2
MAX_NUM_SEQS=$3
TP_SIZE=$4
PP_SIZE=$5
COMM_BACKEND=$6
PP_LAYER_PARTITION=${7:-}
KV_CACHE_DTYPE=${8:-auto}
DO_WARMUP=${9:-true}
DO_PROFILE=${10:-false}
HOST=${11:-127.0.0.1}
PORT=${12:-8688}
MODEL_PATH=${13:-${MODEL_PATH:-/root/.cache/huggingface/DeepSeek-R1-BF16-w8afp8-dynamic-no-ste-G2}}
RESULTS_DIR=${14:-logs/test-results}

if [ "$DO_PROFILE" == "true" ]; then
    hl-prof-config --use-template profile_api --hw-trace off
    export HABANA_PROFILE=1
    export VLLM_PROFILER_ENABLED=full
    export VLLM_TORCH_PROFILER_DIR=${RESULTS_DIR}/profiler/
fi

# Environment settings
export HABANA_VISIBLE_DEVICES="ALL"
export PT_HPU_LAZY_MODE=1
export PT_HPU_ENABLE_LAZY_COLLECTIVES="true"
export VLLM_RAY_DISABLE_LOG_TO_DRIVER="1"
export RAY_IGNORE_UNHANDLED_ERRORS="1"
export PT_HPU_WEIGHT_SHARING=0
export HABANA_VISIBLE_MODULES="0,1,2,3,4,5,6,7"

if [ "$DO_WARMUP" == "true" ]; then
    export VLLM_SKIP_WARMUP=false
else
    export VLLM_SKIP_WARMUP=true
fi
export VLLM_MLA_DISABLE_REQUANTIZATION=1
export VLLM_MLA_PERFORM_MATRIX_ABSORPTION=0
export VLLM_DELAYED_SAMPLING="false"

# Memory footprint tuning params
export VLLM_GPU_MEMORY_UTILIZATION=${VLLM_GPU_MEMORY_UTILIZATION:-0.75}
export VLLM_GRAPH_RESERVED_MEM=${VLLM_GRAPH_RESERVED_MEM:-0.4}
export VLLM_GRAPH_PROMPT_RATIO=0

export VLLM_EP_SIZE=$TP_SIZE
if [ "$PP_SIZE" -gt 1 ]; then
    if [ -n "$PP_LAYER_PARTITION" ]; then
        echo "PP_SIZE = ${PP_SIZE}, PP_LAYER_PARTITION = ${PP_LAYER_PARTITION}"
        export VLLM_PP_LAYER_PARTITION=$PP_LAYER_PARTITION
    else
        echo "Warning: PP_SIZE > 1 but PP_LAYER_PARTITION not provided"
    fi
fi

if [ "$COMM_BACKEND" == "gloo" ]; then
    export VLLM_PP_USE_CPU_COMS=1
fi

if [ "$KV_CACHE_DTYPE" == "fp8_inc" ]; then
    # Required to improve performance with FP8 KV cache.
    export VLLM_USE_FP8_MATMUL="true"
fi

# Bucketing configuration
BLOCK_SIZE=128
export PT_HPU_RECIPE_CACHE_CONFIG="/data/${MAX_MODEL_LEN}_cache,false,${MAX_MODEL_LEN}"
MAX_NUM_BATCHED_TOKENS=$MAX_MODEL_LEN

prompt_bs_min=1
prompt_bs_step=$(( MAX_NUM_SEQS > 32 ? 32 : MAX_NUM_SEQS ))
prompt_bs_max=$(( MAX_NUM_SEQS > 64 ? 64 : MAX_NUM_SEQS ))
export VLLM_PROMPT_BS_BUCKET_MIN=${VLLM_PROMPT_BS_BUCKET_MIN:-$prompt_bs_min}
export VLLM_PROMPT_BS_BUCKET_STEP=${VLLM_PROMPT_BS_BUCKET_STEP:-$prompt_bs_step}
export VLLM_PROMPT_BS_BUCKET_MAX=${VLLM_PROMPT_BS_BUCKET_MAX:-$prompt_bs_max}

prompt_seq_min=128
prompt_seq_step=128
prompt_seq_max=$MAX_NUM_BATCHED_TOKENS
export VLLM_PROMPT_SEQ_BUCKET_MIN=${VLLM_PROMPT_SEQ_BUCKET_MIN:-$prompt_seq_min}
export VLLM_PROMPT_SEQ_BUCKET_STEP=${VLLM_PROMPT_SEQ_BUCKET_STEP:-$prompt_seq_step}
export VLLM_PROMPT_SEQ_BUCKET_MAX=${VLLM_PROMPT_SEQ_BUCKET_MAX:-$prompt_seq_max}

decode_bs_min=1
decode_bs_step=$(( MAX_NUM_SEQS > 32 ? 32 : MAX_NUM_SEQS ))
decode_bs_max=$MAX_NUM_SEQS
export VLLM_DECODE_BS_BUCKET_MIN=${VLLM_DECODE_BS_BUCKET_MIN:-$decode_bs_min}
export VLLM_DECODE_BS_BUCKET_STEP=${VLLM_DECODE_BS_BUCKET_STEP:-$decode_bs_step}
export VLLM_DECODE_BS_BUCKET_MAX=${VLLM_DECODE_BS_BUCKET_MAX:-$decode_bs_max}

decode_block_min=128
decode_block_step=128
decode_block_max=$(( ((MAX_NUM_SEQS * MAX_MODEL_LEN / BLOCK_SIZE) > 128) ? (MAX_NUM_SEQS * MAX_MODEL_LEN / BLOCK_SIZE) : 128 ))
export VLLM_DECODE_BLOCK_BUCKET_MIN=${VLLM_DECODE_BLOCK_BUCKET_MIN:-$decode_block_min}
export VLLM_DECODE_BLOCK_BUCKET_STEP=${VLLM_DECODE_BLOCK_BUCKET_STEP:-$decode_block_step}
export VLLM_DECODE_BLOCK_BUCKET_MAX=${VLLM_DECODE_BLOCK_BUCKET_MAX:-$decode_block_max}

echo "Environments set for ${NUM_NODES}-node server: MAX_MODEL_LEN=${MAX_MODEL_LEN}, MAX_NUM_SEQS=${MAX_NUM_SEQS}, TP_SIZE=${TP_SIZE}, PP_SIZE=${PP_SIZE}, COMM_BACKEND=${COMM_BACKEND}"
env | grep VLLM

python3 -m vllm.entrypoints.openai.api_server --host $HOST --port $PORT \
    --block-size $BLOCK_SIZE \
    --model $MODEL_PATH \
    --device hpu \
    --dtype bfloat16 \
    --kv-cache-dtype $KV_CACHE_DTYPE \
    --tensor-parallel-size $TP_SIZE \
    --pipeline-parallel-size $PP_SIZE \
    --trust-remote-code \
    --max-model-len $MAX_MODEL_LEN \
    --max-num-seqs $MAX_NUM_SEQS \
    --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
    --disable-log-requests \
    --use-padding-aware-scheduling \
    --use-v2-block-manager \
    --distributed_executor_backend ray \
    --gpu_memory_utilization $VLLM_GPU_MEMORY_UTILIZATION \
    --enable-reasoning \
    --reasoning-parser deepseek_r1
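
A usage sketch following the script's own header comment. All values are illustrative; the "31,30" partition assumes a 61-layer model split across two pipeline stages:

    # 2 nodes, 8192-token context, 64 sequences, TP=8 x PP=2 over hccl:
    bash scripts/benchmark_server_param.sh 2 8192 64 8 2 hccl "31,30"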
