Skip to content
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -413,7 +413,7 @@ minimaxm2.5-fp8-mi325x-vllm:
- { tp: 4, conc-start: 4, conc-end: 64 }

gptoss-fp4-mi300x-vllm:
image: vllm/vllm-openai-rocm:v0.16.0
image: vllm/vllm-openai-rocm:v0.17.0
model: openai/gpt-oss-120b
model-prefix: gptoss
runner: mi300x
Expand Down Expand Up @@ -444,7 +444,7 @@ gptoss-fp4-mi300x-vllm:
- { tp: 8, conc-start: 4, conc-end: 16 }

gptoss-fp4-mi325x-vllm:
image: vllm/vllm-openai-rocm:v0.16.0
image: vllm/vllm-openai-rocm:v0.17.0
model: openai/gpt-oss-120b
model-prefix: gptoss
runner: mi325x
Expand Down Expand Up @@ -475,8 +475,8 @@ gptoss-fp4-mi325x-vllm:
- { tp: 8, conc-start: 4, conc-end: 16 }

gptoss-fp4-mi355x-vllm:
image: vllm/vllm-openai-rocm:v0.16.0
model: openai/gpt-oss-120b
image: vllm/vllm-openai-rocm:v0.17.0
model: amd/gpt-oss-120b-w-mxfp4-a-fp8
model-prefix: gptoss
runner: mi355x
precision: fp4
Expand Down
18 changes: 9 additions & 9 deletions benchmarks/single_node/gptoss_fp4_mi300x.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,23 +33,23 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
fi

export AMDGCN_USE_BUFFER_OPS=0
export VLLM_ROCM_USE_AITER=1
export VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION=1
export VLLM_ROCM_USE_AITER_MHA=0
export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
ATTN_BACKEND="--attention-backend ROCM_AITER_UNIFIED_ATTN"
FUSE_ROPE_KVCACHE="-cc.pass_config.fuse_rope_kvcache=True -cc.use_inductor_graph_partition=True"

SERVER_LOG=/workspace/server.log
PORT=${PORT:-8888}

set -x
vllm serve $MODEL --port $PORT \
--tensor-parallel-size=$TP \
--gpu-memory-utilization 0.95 \
--max-model-len $MAX_MODEL_LEN \
--compilation-config '{"cudagraph_mode": "FULL_AND_PIECEWISE"}' \
--block-size=64 \
--no-enable-prefix-caching \
--disable-log-requests > $SERVER_LOG 2>&1 &
$ATTN_BACKEND $FUSE_ROPE_KVCACHE \
--tensor-parallel-size=$TP \
--gpu-memory-utilization 0.95 \
--max-model-len $MAX_MODEL_LEN \
--block-size=64 \
--no-enable-prefix-caching > $SERVER_LOG 2>&1 &

SERVER_PID=$!

Expand Down
19 changes: 10 additions & 9 deletions benchmarks/single_node/gptoss_fp4_mi325x.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,22 +33,23 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
fi

export AMDGCN_USE_BUFFER_OPS=0
export VLLM_ROCM_USE_AITER=1
export VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION=1
export VLLM_ROCM_USE_AITER_MHA=0
export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
ATTN_BACKEND="--attention-backend ROCM_AITER_UNIFIED_ATTN"
FUSE_ROPE_KVCACHE="-cc.pass_config.fuse_rope_kvcache=True -cc.use_inductor_graph_partition=True"

SERVER_LOG=/workspace/server.log
PORT=${PORT:-8888}

set -x
vllm serve $MODEL --port $PORT \
--tensor-parallel-size=$TP \
--gpu-memory-utilization 0.95 \
--max-model-len $MAX_MODEL_LEN \
--compilation-config '{"cudagraph_mode": "FULL_AND_PIECEWISE"}' \
--block-size=64 \
--no-enable-prefix-caching \
--disable-log-requests > $SERVER_LOG 2>&1 &
$ATTN_BACKEND $FUSE_ROPE_KVCACHE \
--tensor-parallel-size=$TP \
--gpu-memory-utilization 0.95 \
--max-model-len $MAX_MODEL_LEN \
--block-size=64 \
--no-enable-prefix-caching > $SERVER_LOG 2>&1 &

SERVER_PID=$!

Expand Down
20 changes: 11 additions & 9 deletions benchmarks/single_node/gptoss_fp4_mi355x.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,22 +33,24 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
fi

export AMDGCN_USE_BUFFER_OPS=0
export VLLM_ROCM_USE_AITER=1
export VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION=1
export VLLM_ROCM_USE_AITER_MHA=0
export VLLM_ROCM_USE_AITER_TRITON_ROPE=1
export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
ATTN_BACKEND="--attention-backend ROCM_AITER_UNIFIED_ATTN"
FUSE_ROPE_KVCACHE="-cc.pass_config.fuse_rope_kvcache=True -cc.use_inductor_graph_partition=True"

SERVER_LOG=/workspace/server.log
PORT=${PORT:-8888}

set -x
vllm serve $MODEL --port $PORT \
--tensor-parallel-size=$TP \
--gpu-memory-utilization 0.95 \
--max-model-len $MAX_MODEL_LEN \
--compilation-config '{"cudagraph_mode": "FULL_AND_PIECEWISE"}' \
--block-size=64 \
--no-enable-prefix-caching \
--disable-log-requests > $SERVER_LOG 2>&1 &
$ATTN_BACKEND $FUSE_ROPE_KVCACHE \
--tensor-parallel-size=$TP \
--gpu-memory-utilization 0.95 \
--max-model-len $MAX_MODEL_LEN \
--block-size=64 \
--no-enable-prefix-caching > $SERVER_LOG 2>&1 &

SERVER_PID=$!

Expand Down
13 changes: 13 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -896,3 +896,16 @@
- "Expanding TP search space"
- "Adding kv-cache-fp8"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/865

- config-keys:
- gptoss-fp4-mi300x-vllm
- gptoss-fp4-mi325x-vllm
- gptoss-fp4-mi355x-vllm
description:
- "Update AMD GPT-OSS vLLM image from v0.16.0 to v0.17.0 for MI300X, MI325X, and MI355X"
- "MI355X: Switch model to amd/gpt-oss-120b-w-mxfp4-a-fp8 (MXFP4 weights + FP8 activations)"
- "MI355X: Add VLLM_ROCM_USE_AITER_TRITON_ROPE=1 for AITER triton RoPE kernel"
- "Add AMDGCN_USE_BUFFER_OPS=0 and VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 env vars"
- "Switch to --attention-backend ROCM_AITER_UNIFIED_ATTN and add fuse_rope_kvcache compilation pass"
- "Remove deprecated VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION/VLLM_ROCM_USE_AITER_MHA env vars and compilation-config cudagraph_mode"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/867
25 changes: 16 additions & 9 deletions runners/launch_mi355x-amds.sh
Original file line number Diff line number Diff line change
Expand Up @@ -156,23 +156,30 @@ else

PARTITION="compute"
SQUASH_FILE="/var/lib/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
LOCK_FILE="${SQUASH_FILE}.lock"

set -x
salloc --partition=$PARTITION --gres=gpu:$TP --cpus-per-task=128 --time=180 --no-shell --job-name="$RUNNER_NAME"
JOB_ID=$(squeue --name="$RUNNER_NAME" -h -o %A | head -n1)

srun --jobid=$JOB_ID bash -c "docker stop \$(docker ps -a -q)"

if [[ "$FRAMEWORK" == "atom" ]]; then
srun --jobid=$JOB_ID bash -c "rm $SQUASH_FILE"
fi
# Use flock to serialize concurrent imports to the same squash file
srun --jobid=$JOB_ID bash -c "
exec 9>\"$LOCK_FILE\"
flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; }
if [[ \"$FRAMEWORK\" == \"atom\" ]]; then
rm -f \"$SQUASH_FILE\"
fi
if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then
echo 'Squash file already exists and is valid, skipping import'
else
rm -f \"$SQUASH_FILE\"
enroot import -o \"$SQUASH_FILE\" docker://$IMAGE
fi
"

srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE"
if ! srun --jobid=$JOB_ID bash -c "unsquashfs -l $SQUASH_FILE > /dev/null"; then
echo "unsquashfs failed, removing $SQUASH_FILE and re-importing..."
srun --jobid=$JOB_ID bash -c "rm -f $SQUASH_FILE"
srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE"
fi
export VLLM_CACHE_ROOT="/it-share/gharunners/.cache/vllm"

srun --jobid=$JOB_ID \
--container-image=$SQUASH_FILE \
Expand Down
Loading