|
#!/bin/bash
# Echo every command before executing it; the benchmark log doubles as a trace.
set -x

# Usage: benchmark_server_param.sh NUM_NODES MAX_MODEL_LEN MAX_NUM_SEQS TP_SIZE PP_SIZE \
#            COMM_BACKEND [PP_LAYER_PARTITION] [KV_CACHE_DTYPE] \
#            [DO_WARMUP] [DO_PROFILE] [HOST] [PORT] [MODEL_PATH] [RESULTS_DIR]
#
# Arguments:
#   NUM_NODES            Number of nodes to use for the server.
#   MAX_MODEL_LEN        Maximum model length (number of tokens).
#   MAX_NUM_SEQS         Maximum number of sequences to process concurrently.
#   TP_SIZE              Tensor parallelism size.
#   PP_SIZE              Pipeline parallelism size.
#   COMM_BACKEND         Communication backend to use (e.g., hccl, gloo).
#   PP_LAYER_PARTITION   (Optional) Layer partitioning for pipeline parallelism (comma-separated list).
#   KV_CACHE_DTYPE       (Optional) Data type for KV cache (e.g., auto, fp8_inc). Default: auto.
#   DO_WARMUP            (Optional) Whether to perform warmup before benchmarking (true/false). Default: true.
#   DO_PROFILE           (Optional) Whether to enable profiling (true/false). Default: false.
#   HOST                 (Optional) Host address for the server. Default: 127.0.0.1.
#   PORT                 (Optional) Port for the server. Default: 8688.
#   MODEL_PATH           (Optional) Path to the model. Default: /root/.cache/huggingface/DeepSeek-R1-BF16-w8afp8-dynamic-no-ste-G2.
#   RESULTS_DIR          (Optional) Directory to store results and logs. Default: logs/test-results.
#
# Description:
#   This script launches a vLLM server with the specified configuration for benchmarking.
#   It supports various parallelism configurations (tensor and pipeline), communication backends,
#   and optional profiling. The script sets up the environment, configures memory and scheduling
#   parameters, and starts the server with the provided arguments.
#
#   Use this script as part of a benchmarking workflow to evaluate the performance of vLLM
#   under different configurations.

# Required positional arguments (1-6). No validation is performed here; a
# missing value surfaces later as an empty expansion.
NUM_NODES=$1
MAX_MODEL_LEN=$2
MAX_NUM_SEQS=$3
TP_SIZE=$4
PP_SIZE=$5
COMM_BACKEND=$6
# Optional positional arguments (7-14) with defaults.
PP_LAYER_PARTITION=${7:-}
KV_CACHE_DTYPE=${8:-auto}
DO_WARMUP=${9:-true}
DO_PROFILE=${10:-false}
HOST=${11:-127.0.0.1}
PORT=${12:-8688}
# Precedence: arg 13, then a pre-set MODEL_PATH env var, then the baked-in default.
MODEL_PATH=${13:-${MODEL_PATH:-/root/.cache/huggingface/DeepSeek-R1-BF16-w8afp8-dynamic-no-ste-G2}}
RESULTS_DIR=${14:-logs/test-results}
| 47 | + |
# Optional profiling: configure the Habana profiler from the profile_api
# template (hardware trace off) and enable vLLM's profiler hooks, with
# traces written under RESULTS_DIR.
if [[ "$DO_PROFILE" == "true" ]]; then
  hl-prof-config --use-template profile_api --hw-trace off
  export HABANA_PROFILE=1
  export VLLM_PROFILER_ENABLED=full
  export VLLM_TORCH_PROFILER_DIR="${RESULTS_DIR}/profiler/"
fi
| 54 | + |
# Environment settings
# Expose all HPU devices and modules 0-7 to the server processes.
export HABANA_VISIBLE_DEVICES="ALL"
export HABANA_VISIBLE_MODULES="0,1,2,3,4,5,6,7"
# PyTorch/HPU execution flags.
export PT_HPU_LAZY_MODE=1
export PT_HPU_ENABLE_LAZY_COLLECTIVES="true"
export PT_HPU_WEIGHT_SHARING=0
# Quiet down Ray driver logging and ignore unhandled Ray errors.
export VLLM_RAY_DISABLE_LOG_TO_DRIVER="1"
export RAY_IGNORE_UNHANDLED_ERRORS="1"
| 63 | + |
# Warmup is skipped unless it was explicitly requested (DO_WARMUP=true).
skip_warmup=true
if [[ "$DO_WARMUP" == "true" ]]; then
  skip_warmup=false
fi
export VLLM_SKIP_WARMUP=$skip_warmup
# MLA and sampling behavior flags for this benchmark configuration.
export VLLM_MLA_DISABLE_REQUANTIZATION=1
export VLLM_MLA_PERFORM_MATRIX_ABSORPTION=0
export VLLM_DELAYED_SAMPLING="false"
| 72 | + |
# Memory-footprint tuning parameters; values already present in the
# environment take precedence over these defaults.
: "${VLLM_GPU_MEMORY_UTILIZATION:=0.75}"
: "${VLLM_GRAPH_RESERVED_MEM:=0.4}"
export VLLM_GPU_MEMORY_UTILIZATION VLLM_GRAPH_RESERVED_MEM
export VLLM_GRAPH_PROMPT_RATIO=0
| 77 | + |
# Expert parallelism width follows the tensor-parallel size.
export VLLM_EP_SIZE=$TP_SIZE

# Pipeline-parallel layer partitioning (only meaningful when PP_SIZE > 1).
# Guard the numeric comparison so an empty or non-numeric PP_SIZE does not
# trigger a cryptic "[: integer expression expected" error.
if [[ "$PP_SIZE" =~ ^[0-9]+$ ]] && (( PP_SIZE > 1 )); then
  if [[ -n "$PP_LAYER_PARTITION" ]]; then
    echo "PP_SIZE = ${PP_SIZE}, PP_LAYER_PARTITION = ${PP_LAYER_PARTITION}"
    export VLLM_PP_LAYER_PARTITION=$PP_LAYER_PARTITION
  else
    # Diagnostics go to stderr so they don't pollute captured benchmark output.
    echo "Warning: PP_SIZE > 1 but PP_LAYER_PARTITION not provided" >&2
  fi
fi

# gloo backend: route pipeline-parallel communication over CPU
# (presumably what VLLM_PP_USE_CPU_COMS toggles — confirm against vLLM HPU docs).
if [[ "$COMM_BACKEND" == "gloo" ]]; then
  export VLLM_PP_USE_CPU_COMS=1
fi
| 91 | + |
# FP8 KV cache needs FP8 matmuls enabled to reach its performance target.
case "$KV_CACHE_DTYPE" in
  fp8_inc) export VLLM_USE_FP8_MATMUL="true" ;;
esac
| 96 | + |
# Bucketing configuration
BLOCK_SIZE=128
export PT_HPU_RECIPE_CACHE_CONFIG="/data/${MAX_MODEL_LEN}_cache,false,${MAX_MODEL_LEN}"
MAX_NUM_BATCHED_TOKENS=$MAX_MODEL_LEN

# Prompt batch-size buckets: step is capped at 32, max is capped at 64.
# Pre-set VLLM_*_BUCKET_* environment variables always win over the computed
# defaults below.
if (( MAX_NUM_SEQS > 32 )); then
  prompt_bs_step=32
else
  prompt_bs_step=$(( MAX_NUM_SEQS ))
fi
if (( MAX_NUM_SEQS > 64 )); then
  prompt_bs_max=64
else
  prompt_bs_max=$(( MAX_NUM_SEQS ))
fi
export VLLM_PROMPT_BS_BUCKET_MIN=${VLLM_PROMPT_BS_BUCKET_MIN:-1}
export VLLM_PROMPT_BS_BUCKET_STEP=${VLLM_PROMPT_BS_BUCKET_STEP:-$prompt_bs_step}
export VLLM_PROMPT_BS_BUCKET_MAX=${VLLM_PROMPT_BS_BUCKET_MAX:-$prompt_bs_max}

# Prompt sequence-length buckets: 128-token steps up to the batched-token cap.
export VLLM_PROMPT_SEQ_BUCKET_MIN=${VLLM_PROMPT_SEQ_BUCKET_MIN:-128}
export VLLM_PROMPT_SEQ_BUCKET_STEP=${VLLM_PROMPT_SEQ_BUCKET_STEP:-128}
export VLLM_PROMPT_SEQ_BUCKET_MAX=${VLLM_PROMPT_SEQ_BUCKET_MAX:-$MAX_NUM_BATCHED_TOKENS}

# Decode batch-size buckets: step is capped at 32, max is full concurrency.
if (( MAX_NUM_SEQS > 32 )); then
  decode_bs_step=32
else
  decode_bs_step=$(( MAX_NUM_SEQS ))
fi
export VLLM_DECODE_BS_BUCKET_MIN=${VLLM_DECODE_BS_BUCKET_MIN:-1}
export VLLM_DECODE_BS_BUCKET_STEP=${VLLM_DECODE_BS_BUCKET_STEP:-32}
export VLLM_DECODE_BS_BUCKET_STEP=${VLLM_DECODE_BS_BUCKET_STEP:-$decode_bs_step}
export VLLM_DECODE_BS_BUCKET_MAX=${VLLM_DECODE_BS_BUCKET_MAX:-$MAX_NUM_SEQS}

# Decode KV-block buckets: worst-case blocks needed across all sequences,
# with a floor of 128.
blocks_needed=$(( MAX_NUM_SEQS * MAX_MODEL_LEN / BLOCK_SIZE ))
if (( blocks_needed > 128 )); then
  decode_block_max=$blocks_needed
else
  decode_block_max=128
fi
export VLLM_DECODE_BLOCK_BUCKET_MIN=${VLLM_DECODE_BLOCK_BUCKET_MIN:-128}
export VLLM_DECODE_BLOCK_BUCKET_STEP=${VLLM_DECODE_BLOCK_BUCKET_STEP:-128}
export VLLM_DECODE_BLOCK_BUCKET_MAX=${VLLM_DECODE_BLOCK_BUCKET_MAX:-$decode_block_max}
| 129 | + |
# Log the effective configuration, then dump every VLLM-related environment
# variable for the benchmark record.
echo "Environments set for ${NUM_NODES}-node server: MAX_MODEL_LEN=${MAX_MODEL_LEN}, MAX_NUM_SEQS=${MAX_NUM_SEQS}, TP_SIZE=${TP_SIZE}, PP_SIZE=${PP_SIZE}, COMM_BACKEND=${COMM_BACKEND}"
# grep exits non-zero when nothing matches; neutralize that so $? stays clean
# and the script would survive a future `set -e`.
env | grep VLLM || true
| 132 | + |
# Launch the OpenAI-compatible vLLM API server with the configuration
# assembled above. All expansions are quoted so a host, port, or model path
# containing whitespace cannot word-split the command line. Flag spellings
# use the dash form consistently (vLLM's parser accepts both, but the file
# previously mixed `--distributed_executor_backend` with dash-style flags).
# `exec` replaces this shell with the server so signals (SIGTERM/SIGINT)
# reach the server process directly.
exec python3 -m vllm.entrypoints.openai.api_server --host "$HOST" --port "$PORT" \
  --block-size "$BLOCK_SIZE" \
  --model "$MODEL_PATH" \
  --device hpu \
  --dtype bfloat16 \
  --kv-cache-dtype "$KV_CACHE_DTYPE" \
  --tensor-parallel-size "$TP_SIZE" \
  --pipeline-parallel-size "$PP_SIZE" \
  --trust-remote-code \
  --max-model-len "$MAX_MODEL_LEN" \
  --max-num-seqs "$MAX_NUM_SEQS" \
  --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" \
  --disable-log-requests \
  --use-padding-aware-scheduling \
  --use-v2-block-manager \
  --distributed-executor-backend ray \
  --gpu-memory-utilization "$VLLM_GPU_MEMORY_UTILIZATION" \
  --enable-reasoning \
  --reasoning-parser deepseek_r1
0 commit comments