diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index c362604f1..3eb2d4449 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -338,7 +338,7 @@ kimik2.5-int4-mi325x-vllm: - { tp: 8, conc-start: 4, conc-end: 64 } kimik2.5-fp4-mi355x-vllm: - image: vllm/vllm-openai-rocm:v0.16.0 + image: vllm/vllm-openai-rocm:v0.18.0 model: amd/Kimi-K2.5-MXFP4 model-prefix: kimik2.5 runner: mi355x @@ -350,14 +350,18 @@ kimik2.5-fp4-mi355x-vllm: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } - isl: 1024 osl: 8192 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } minimaxm2.5-fp8-mi355x-vllm: image: vllm/vllm-openai-rocm:v0.18.0 diff --git a/benchmarks/single_node/kimik2.5_fp4_mi355x.sh b/benchmarks/single_node/kimik2.5_fp4_mi355x.sh index bb522b396..a8bd01442 100755 --- a/benchmarks/single_node/kimik2.5_fp4_mi355x.sh +++ b/benchmarks/single_node/kimik2.5_fp4_mi355x.sh @@ -31,9 +31,29 @@ fi SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} -# do not enable aiter due to Aiter MLA not currently supporting num_heads=8 -# https://github.com/vllm-project/vllm/issues/35641 -# export VLLM_ROCM_USE_AITER=1 +# If the machine runs a MEC FW older than 177, RCCL +# cannot reclaim some memory. +# Disable that feature to avoid crashes.
+# This is related to the changes in the driver at: +# https://rocm.docs.amd.com/en/docs-6.4.3/about/release-notes.html#amdgpu-driver-updates +version=`rocm-smi --showfw | grep MEC | head -n 1 | awk '{print $NF}'` +if [[ "$version" == "" || $version -lt 177 ]]; then + export HSA_NO_SCRATCH_RECLAIM=1 +fi + +export VLLM_ROCM_USE_AITER=1 +export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 + +# Disable AITER RMSNorm for TP < 8 due to accuracy issues +if [ "${TP}" -lt 8 ]; then + export VLLM_ROCM_USE_AITER_RMSNORM=0 +fi + +if [ "${EP_SIZE:-0}" -gt 1 ]; then + EP=" --enable-expert-parallel" +else + EP=" " +fi # following AMD andy luo's recipe # https://x.com/linluo77/status/2017024513595301985 @@ -44,10 +64,11 @@ start_gpu_monitor set -x vllm serve $MODEL --port $PORT \ --tensor-parallel-size=$TP \ ---gpu-memory-utilization 0.95 \ +$EP \ +--gpu-memory-utilization 0.90 \ --max-model-len $MAX_MODEL_LEN \ ---block-size=64 \ ---disable-log-requests \ +--block-size=1 \ +--no-enable-prefix-caching \ --trust-remote-code \ --mm-encoder-tp-mode data > $SERVER_LOG 2>&1 & diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 03fb6e082..2648e746e 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1068,3 +1068,12 @@ - "dsr1-fp8-h200-sglang: v0.5.9-cu129-amd64 → v0.5.9-cu130" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/943 +- config-keys: + - kimik2.5-fp4-mi355x-vllm + description: + - "Upgrade vLLM ROCm image from v0.16.0 to v0.18.0" + - "Enable AITER with INT4 quick reduce; disable AITER RMSNorm for TP < 8 (accuracy)" + - "Add expert parallel, TP4, and TP4/EP4 search spaces" + - "Switch block-size from 64 to 1 and gpu-memory-utilization from 0.95 to 0.90" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/936 + \ No newline at end of file