Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/70b-tmpl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ jobs:
max-model-len: ${{ inputs.max-model-len }}
random-range-ratio: ${{ inputs.random-range-ratio }}
runner: b200
image: 'kedarpotdar147/vllm0.1:latest'
image: 'kedarpotdar147/vllm:05'
model: 'nvidia/Llama-3.3-70B-Instruct-FP8'
tp-list: '[1, 2, 4, 8]'
timeout: ${{ inputs.timeout }}
Expand Down
25 changes: 21 additions & 4 deletions benchmarks/70b_b200_slurm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,29 @@ hf download $MODEL
SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
PORT=$(( 8888 + $PORT_OFFSET ))

# Replace the NCCL wheel that ships in the image with a pinned version.
pip uninstall -y nvidia-nccl-cu12
pip install nvidia-nccl-cu12==2.26.2.post1

# Rebuild FlashInfer from source at a pinned commit.
FLASHINFER_COMMIT=9720182476ede910698f8d783c29b2ec91cec023
pip uninstall -y flashinfer-python
git clone --recursive https://github.com/flashinfer-ai/flashinfer.git
(
  # The checkout has to happen inside the clone: run from the parent directory
  # it would act on the enclosing repository (or fail outside a work tree) and
  # the install below would silently build the default-branch HEAD instead.
  # The subshell keeps the directory change from leaking into the rest of the
  # script, so later relative paths still resolve from the original directory.
  cd flashinfer \
    && git checkout "$FLASHINFER_COMMIT" \
    && git submodule update --init --recursive \
    && pip install .
  # ^ submodules are re-synced because `clone --recursive` fetched them for the
  #   default branch, not for the pinned commit.
)

# JSON map read by vLLM; presumably keyed by tensor-parallel size with values
# in MB, as the variable name suggests -- TODO confirm against vLLM docs.
export VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB='{"2":32,"4":32,"8":8}'

# Compilation config JSON handed to `vllm serve --compilation-config`.
# It contains `[` and `]`, so expand it quoted ("$FUSION_FLAG") to keep the
# shell from treating the value as a glob pattern.
FUSION_FLAG='{"pass_config":{"enable_fi_allreduce_fusion":true,"enable_attn_fusion":true,"enable_noop":true},"custom_ops":["+quant_fp8","+rms_norm"],"cudagraph_mode":"FULL_DECODE_ONLY","splitting_ops":[]}'

# NOTE(review): not referenced in the visible part of this script -- the serve
# command passes --no-enable-prefix-caching literally. Kept in case a part of
# the script outside this view uses it.
NO_PREFIX_CACHING_FLAG="--no-enable-prefix-caching"

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems redundant



export TORCH_CUDA_ARCH_LIST="10.0"
vllm serve $MODEL --host 0.0.0.0 --port $PORT \
--trust-remote-code --quantization modelopt --kv-cache-dtype fp8 --gpu-memory-utilization 0.9 \
--pipeline-parallel-size 1 --tensor-parallel-size $TP --max-num-seqs $CONC --max-num-batched-tokens 8192 --max-model-len $MAX_MODEL_LEN \
--trust-remote-code --kv-cache-dtype fp8 --gpu-memory-utilization 0.9 \
--pipeline-parallel-size 1 --tensor-parallel-size $TP --max-num-seqs $CONC --max-num-batched-tokens 8192 --max-num-seqs 512 --max-model-len $MAX_MODEL_LEN \
--enable-chunked-prefill --async-scheduling --no-enable-prefix-caching \
--compilation-config '{"pass_config": {"enable_fi_allreduce_fusion": true}, "custom_ops": ["+rms_norm"], "level": 3}' \
--compilation-config ${FUSION_FLAG} \
--disable-log-requests > $SERVER_LOG 2>&1 &

set +x
Expand Down Expand Up @@ -54,4 +71,4 @@ python3 bench_serving/benchmark_serving.py \
--request-rate inf --ignore-eos \
--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \
--result-dir /workspace/ \
--result-filename $RESULT_FILENAME.json
--result-filename $RESULT_FILENAME.json
2 changes: 1 addition & 1 deletion runners/launch_b200-nv.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ export PORT_OFFSET=${USER: -1}

MODEL_CODE="${1%%_*}"
PARTITION="dgx-b200"
SQUASH_FILE="/raid/image_${MODEL_CODE}_b200.sqsh"
SQUASH_FILE="/raid/image_${MODEL_CODE}_b200-2.sqsh"

salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell
JOB_ID=$(squeue -u $USER -h -o %A | head -n1)
Expand Down