|
#!/bin/bash
# Echo every command before executing it; the benchmark log doubles as a trace.
set -x

# Usage: benchmark_server_param.sh NUM_NODES MAX_MODEL_LEN MAX_NUM_SEQS TP_SIZE PP_SIZE \
#            COMM_BACKEND [PP_LAYER_PARTITION] [KV_CACHE_DTYPE] \
#            [DO_WARMUP] [DO_PROFILE] [HOST] [PORT] [MODEL_PATH] [RESULTS_DIR]
#
# Arguments:
#   NUM_NODES            Number of nodes to use for the server.
#   MAX_MODEL_LEN        Maximum model length (number of tokens).
#   MAX_NUM_SEQS         Maximum number of sequences to process concurrently.
#   TP_SIZE              Tensor parallelism size.
#   PP_SIZE              Pipeline parallelism size.
#   COMM_BACKEND         Communication backend to use (e.g., hccl, gloo).
#   PP_LAYER_PARTITION   (Optional) Layer partitioning for pipeline parallelism (comma-separated list).
#   KV_CACHE_DTYPE       (Optional) Data type for KV cache (e.g., auto, fp8_inc). Default: auto.
#   DO_WARMUP            (Optional) Whether to perform warmup before benchmarking (true/false). Default: true.
#   DO_PROFILE           (Optional) Whether to enable profiling (true/false). Default: false.
#   HOST                 (Optional) Host address for the server. Default: 127.0.0.1.
#   PORT                 (Optional) Port for the server. Default: 8688.
#   MODEL_PATH           (Optional) Path to the model. Default: /root/.cache/huggingface/DeepSeek-R1-BF16-w8afp8-dynamic-no-ste-G2.
#   RESULTS_DIR          (Optional) Directory to store results and logs. Default: logs/test-results.
#
# Description:
#   This script launches a vLLM server with the specified configuration for benchmarking.
#   It supports various parallelism configurations (tensor and pipeline), communication backends,
#   and optional profiling. The script sets up the environment, configures memory and scheduling
#   parameters, and starts the server with the provided arguments.
#
#   Use this script as part of a benchmarking workflow to evaluate the performance of vLLM
#   under different configurations.

# Required positional arguments (1-6). No validation is performed here; a
# missing value surfaces later as an empty expansion.
NUM_NODES=$1
MAX_MODEL_LEN=$2
MAX_NUM_SEQS=$3
TP_SIZE=$4
PP_SIZE=$5
COMM_BACKEND=$6
# Optional positional arguments (7-14) with defaults.
PP_LAYER_PARTITION=${7:-}
KV_CACHE_DTYPE=${8:-auto}
DO_WARMUP=${9:-true}
DO_PROFILE=${10:-false}
HOST=${11:-127.0.0.1}
PORT=${12:-8688}
# Precedence: arg 13, then a pre-set MODEL_PATH env var, then the baked-in default.
MODEL_PATH=${13:-${MODEL_PATH:-/root/.cache/huggingface/DeepSeek-R1-BF16-w8afp8-dynamic-no-ste-G2}}
RESULTS_DIR=${14:-logs/test-results}
| 47 | + |
# Optional profiling: configure the Habana profiler from the profile_api
# template (hardware trace off) and enable vLLM's profiler hooks, with
# traces written under RESULTS_DIR.
if [[ "$DO_PROFILE" == "true" ]]; then
  hl-prof-config --use-template profile_api --hw-trace off
  export HABANA_PROFILE=1
  export VLLM_PROFILER_ENABLED=full
  export VLLM_TORCH_PROFILER_DIR="${RESULTS_DIR}/profiler/"
fi
| 54 | + |
# Environment settings
# Expose all HPU devices and modules 0-7 to the server processes.
export HABANA_VISIBLE_DEVICES="ALL"
export HABANA_VISIBLE_MODULES="0,1,2,3,4,5,6,7"
# PyTorch/HPU execution flags.
export PT_HPU_LAZY_MODE=1
export PT_HPU_ENABLE_LAZY_COLLECTIVES="true"
export PT_HPU_WEIGHT_SHARING=0
# Quiet down Ray driver logging and ignore unhandled Ray errors.
export VLLM_RAY_DISABLE_LOG_TO_DRIVER="1"
export RAY_IGNORE_UNHANDLED_ERRORS="1"
| 63 | + |
# Warmup is skipped unless it was explicitly requested (DO_WARMUP=true).
skip_warmup=true
if [[ "$DO_WARMUP" == "true" ]]; then
  skip_warmup=false
fi
export VLLM_SKIP_WARMUP=$skip_warmup
# MLA and sampling behavior flags for this benchmark configuration.
export VLLM_MLA_DISABLE_REQUANTIZATION=1
export VLLM_MLA_PERFORM_MATRIX_ABSORPTION=0
export VLLM_DELAYED_SAMPLING="false"
| 72 | + |
# Memory-footprint tuning parameters; values already present in the
# environment take precedence over these defaults.
: "${VLLM_GPU_MEMORY_UTILIZATION:=0.75}"
: "${VLLM_GRAPH_RESERVED_MEM:=0.4}"
export VLLM_GPU_MEMORY_UTILIZATION VLLM_GRAPH_RESERVED_MEM
export VLLM_GRAPH_PROMPT_RATIO=0
| 77 | + |
# Expert parallelism width follows the tensor-parallel size.
export VLLM_EP_SIZE=$TP_SIZE

# Pipeline-parallel layer partitioning (only meaningful when PP_SIZE > 1).
# Guard the numeric comparison so an empty or non-numeric PP_SIZE does not
# trigger a cryptic "[: integer expression expected" error.
if [[ "$PP_SIZE" =~ ^[0-9]+$ ]] && (( PP_SIZE > 1 )); then
  if [[ -n "$PP_LAYER_PARTITION" ]]; then
    echo "PP_SIZE = ${PP_SIZE}, PP_LAYER_PARTITION = ${PP_LAYER_PARTITION}"
    export VLLM_PP_LAYER_PARTITION=$PP_LAYER_PARTITION
  else
    # Diagnostics go to stderr so they don't pollute captured benchmark output.
    echo "Warning: PP_SIZE > 1 but PP_LAYER_PARTITION not provided" >&2
  fi
fi

# gloo backend: route pipeline-parallel communication over CPU
# (presumably what VLLM_PP_USE_CPU_COMS toggles — confirm against vLLM HPU docs).
if [[ "$COMM_BACKEND" == "gloo" ]]; then
  export VLLM_PP_USE_CPU_COMS=1
fi
| 91 | + |
# FP8 KV cache needs FP8 matmuls enabled to reach its performance target.
case "$KV_CACHE_DTYPE" in
  fp8_inc) export VLLM_USE_FP8_MATMUL="true" ;;
esac
| 96 | + |
# Bucketing configuration
BLOCK_SIZE=128
export PT_HPU_RECIPE_CACHE_CONFIG="/data/${MAX_MODEL_LEN}_cache,false,${MAX_MODEL_LEN}"
MAX_NUM_BATCHED_TOKENS=$MAX_MODEL_LEN

# Prompt batch-size buckets: step is capped at 32, max is capped at 64.
# Pre-set VLLM_*_BUCKET_* environment variables always win over the computed
# defaults below.
if (( MAX_NUM_SEQS > 32 )); then
  prompt_bs_step=32
else
  prompt_bs_step=$(( MAX_NUM_SEQS ))
fi
if (( MAX_NUM_SEQS > 64 )); then
  prompt_bs_max=64
else
  prompt_bs_max=$(( MAX_NUM_SEQS ))
fi
export VLLM_PROMPT_BS_BUCKET_MIN=${VLLM_PROMPT_BS_BUCKET_MIN:-1}
export VLLM_PROMPT_BS_BUCKET_STEP=${VLLM_PROMPT_BS_BUCKET_STEP:-$prompt_bs_step}
export VLLM_PROMPT_BS_BUCKET_MAX=${VLLM_PROMPT_BS_BUCKET_MAX:-$prompt_bs_max}

# Prompt sequence-length buckets: 128-token steps up to the batched-token cap.
export VLLM_PROMPT_SEQ_BUCKET_MIN=${VLLM_PROMPT_SEQ_BUCKET_MIN:-128}
export VLLM_PROMPT_SEQ_BUCKET_STEP=${VLLM_PROMPT_SEQ_BUCKET_STEP:-128}
export VLLM_PROMPT_SEQ_BUCKET_MAX=${VLLM_PROMPT_SEQ_BUCKET_MAX:-$MAX_NUM_BATCHED_TOKENS}

# Decode batch-size buckets: step is capped at 32, max is full concurrency.
if (( MAX_NUM_SEQS > 32 )); then
  decode_bs_step=32
else
  decode_bs_step=$(( MAX_NUM_SEQS ))
fi
export VLLM_DECODE_BS_BUCKET_MIN=${VLLM_DECODE_BS_BUCKET_MIN:-1}
export VLLM_DECODE_BS_BUCKET_STEP=${VLLM_DECODE_BS_BUCKET_STEP:-32}
export VLLM_DECODE_BS_BUCKET_STEP=${VLLM_DECODE_BS_BUCKET_STEP:-$decode_bs_step}
export VLLM_DECODE_BS_BUCKET_MAX=${VLLM_DECODE_BS_BUCKET_MAX:-$MAX_NUM_SEQS}

# Decode KV-block buckets: worst-case blocks needed across all sequences,
# with a floor of 128.
blocks_needed=$(( MAX_NUM_SEQS * MAX_MODEL_LEN / BLOCK_SIZE ))
if (( blocks_needed > 128 )); then
  decode_block_max=$blocks_needed
else
  decode_block_max=128
fi
export VLLM_DECODE_BLOCK_BUCKET_MIN=${VLLM_DECODE_BLOCK_BUCKET_MIN:-128}
export VLLM_DECODE_BLOCK_BUCKET_STEP=${VLLM_DECODE_BLOCK_BUCKET_STEP:-128}
export VLLM_DECODE_BLOCK_BUCKET_MAX=${VLLM_DECODE_BLOCK_BUCKET_MAX:-$decode_block_max}
| 129 | + |
# Log the effective configuration, then dump every VLLM-related environment
# variable for the benchmark record.
echo "Environments set for ${NUM_NODES}-node server: MAX_MODEL_LEN=${MAX_MODEL_LEN}, MAX_NUM_SEQS=${MAX_NUM_SEQS}, TP_SIZE=${TP_SIZE}, PP_SIZE=${PP_SIZE}, COMM_BACKEND=${COMM_BACKEND}"
# grep exits non-zero when nothing matches; neutralize that so $? stays clean
# and the script would survive a future `set -e`.
env | grep VLLM || true
| 132 | + |
# Launch the OpenAI-compatible vLLM API server with the configuration
# assembled above. All expansions are quoted so a host, port, or model path
# containing whitespace cannot word-split the command line. Flag spellings
# use the dash form consistently (vLLM's parser accepts both, but the file
# previously mixed `--distributed_executor_backend` with dash-style flags).
# `exec` replaces this shell with the server so signals (SIGTERM/SIGINT)
# reach the server process directly.
exec python3 -m vllm.entrypoints.openai.api_server --host "$HOST" --port "$PORT" \
  --block-size "$BLOCK_SIZE" \
  --model "$MODEL_PATH" \
  --device hpu \
  --dtype bfloat16 \
  --kv-cache-dtype "$KV_CACHE_DTYPE" \
  --tensor-parallel-size "$TP_SIZE" \
  --pipeline-parallel-size "$PP_SIZE" \
  --trust-remote-code \
  --max-model-len "$MAX_MODEL_LEN" \
  --max-num-seqs "$MAX_NUM_SEQS" \
  --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" \
  --disable-log-requests \
  --use-padding-aware-scheduling \
  --use-v2-block-manager \
  --distributed-executor-backend ray \
  --gpu-memory-utilization "$VLLM_GPU_MEMORY_UTILIZATION" \
  --enable-reasoning \
  --reasoning-parser deepseek_r1
0 commit comments