Skip to content
Closed
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -349,15 +349,16 @@ kimik2.5-fp4-mi355x-vllm:
- isl: 1024
osl: 1024
search-space:
- { tp: 8, conc-start: 4, conc-end: 64 }
- { tp: 4, conc-start: 4, conc-end: 64 }
- isl: 1024
osl: 8192
search-space:
- { tp: 8, conc-start: 4, conc-end: 64 }
- { tp: 4, conc-start: 4, conc-end: 64 }
- { tp: 4, ep: 4, conc-start: 4, conc-end: 64 }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, conc-start: 4, conc-end: 64 }
- { tp: 4, conc-start: 4, conc-end: 64 }

minimaxm2.5-fp8-mi355x-vllm:
image: vllm/vllm-openai-rocm:v0.15.1
Expand Down
30 changes: 24 additions & 6 deletions benchmarks/single_node/kimik2.5_fp4_mi355x.sh
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,27 @@ fi
SERVER_LOG=/workspace/server.log
PORT=${PORT:-8888}

# do not enable aiter due to Aiter MLA not currently supporting num_heads=8
# https://github.com/vllm-project/vllm/issues/35641
# export VLLM_ROCM_USE_AITER=1
# If the machine runs MEC firmware older than 177, RCCL cannot reclaim some
# memory. Disable that feature to avoid crashes.
# This is related to the driver changes described at:
# https://rocm.docs.amd.com/en/docs-6.4.3/about/release-notes.html#amdgpu-driver-updates

# Returns 0 when scratch reclaim should be disabled for the MEC firmware
# version given in $1: the value is empty, is not a plain decimal integer,
# or is lower than 177. Returns non-zero otherwise.
_mec_fw_needs_no_scratch_reclaim() {
  local fw_version=$1
  # A value that cannot be parsed is treated like an empty one (unknown
  # firmware) instead of tripping an arithmetic error in the comparison.
  [[ "$fw_version" =~ ^[0-9]+$ ]] || return 0
  # Force base 10 so a zero-padded value is not interpreted as octal.
  (( 10#$fw_version < 177 ))
}

# Last field of the first line containing "MEC" in the rocm-smi firmware report.
version=$(rocm-smi --showfw | grep MEC | head -n 1 | awk '{print $NF}')
if _mec_fw_needs_no_scratch_reclaim "$version"; then
  export HSA_NO_SCRATCH_RECLAIM=1
fi

# ROCm-specific vLLM environment settings, exported so that processes started
# by this script inherit them.
export \
  VLLM_ROCM_USE_AITER=1 \
  VLLM_ROCM_USE_AITER_MLA=1 \
  VLLM_ROCM_USE_AITER_MOE=1 \
  VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT8 \
  VLLM_ROCM_USE_AITER_TRITON_ROPE=1

# Set EP to the expert-parallel flag only when EP_SIZE is greater than 1;
# otherwise EP is a single space. EP_SIZE falls back to 1 when unset or empty
# so the integer test does not fail with "integer expression expected".
# (Review note: the comment thread on this block was marked resolved by chunfangamd.)
if [ "${EP_SIZE:-1}" -gt 1 ]; then
  EP=" --enable-expert-parallel"
else
  EP=" "
fi

# following AMD andy luo's recipe
# https://x.com/linluo77/status/2017024513595301985
Expand All @@ -44,10 +62,10 @@ start_gpu_monitor
set -x
vllm serve $MODEL --port $PORT \
--tensor-parallel-size=$TP \
--gpu-memory-utilization 0.95 \
$EP \
--gpu-memory-utilization 0.90 \
--max-model-len $MAX_MODEL_LEN \
--block-size=64 \
--disable-log-requests \
--block-size=1 \

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

--no-enable-prefix-caching \

@ChuanLi1101 now we need this

--trust-remote-code \
--mm-encoder-tp-mode data > $SERVER_LOG 2>&1 &

Expand Down