SemiAnalysisAI · kimbochen · Sep 3, 2025 · Sep 2, 2025 · Sep 2, 2025 · Sep 3, 2025
diff --git a/.github/workflows/70b-tmpl.yml b/.github/workflows/70b-tmpl.yml
@@ -73,7 +73,7 @@ jobs:
       max-model-len: ${{ inputs.max-model-len }}
       random-range-ratio: ${{ inputs.random-range-ratio }}
       runner: b200
-      image: 'kedarpotdar147/vllm0.1:latest'
+      image: 'kedarpotdar147/vllm:05'  # explicit tag; replaces the previous ':latest' image
       model: 'nvidia/Llama-3.3-70B-Instruct-FP8'
       tp-list: '[1, 2, 4, 8]'
       timeout: ${{ inputs.timeout }}

diff --git a/benchmarks/70b_b200_slurm.sh b/benchmarks/70b_b200_slurm.sh
@@ -21,12 +21,35 @@ hf download $MODEL
 SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
 PORT=$(( 8888 + $PORT_OFFSET ))
 
+# Swap the image's NCCL wheel for the pinned 2.26.2.post1 release.
+pip uninstall -y nvidia-nccl-cu12
+pip install nvidia-nccl-cu12==2.26.2.post1
+
+# Replace the packaged FlashInfer with a source build of a pinned commit.
+# git -C and "pip install ./flashinfer" leave the working directory untouched,
+# so the checkout targets the clone and later relative paths still resolve.
+pip uninstall -y flashinfer-python
+git clone --recursive https://github.com/flashinfer-ai/flashinfer.git
+git -C flashinfer checkout 9720182476ede910698f8d783c29b2ec91cec023
+# Bring submodules to the revisions recorded by the pinned commit.
+git -C flashinfer submodule update --init --recursive
+pip install ./flashinfer
+
+export VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB='{"2":32,"4":32,"8":8}'
+
+# Arguments for the vllm serve call below:
+# - FUSION_FLAG is the --compilation-config JSON; it is expanded inside double
+#   quotes so it stays one argument (no word splitting, no globbing on "[...]").
+# - --max-num-seqs is given once (512); the earlier "$CONC ... 512" pair had the
+#   same effect only because argparse keeps the last occurrence.
+FUSION_FLAG='{"pass_config":{"enable_fi_allreduce_fusion":true,"enable_attn_fusion":true,"enable_noop":true},"custom_ops":["+quant_fp8","+rms_norm"],"cudagraph_mode":"FULL_DECODE_ONLY","splitting_ops":[]}'
+
 export TORCH_CUDA_ARCH_LIST="10.0"
 vllm serve $MODEL --host 0.0.0.0 --port $PORT \
---trust-remote-code --quantization modelopt --kv-cache-dtype fp8 --gpu-memory-utilization 0.9 \
---pipeline-parallel-size 1 --tensor-parallel-size $TP --max-num-seqs $CONC --max-num-batched-tokens 8192 --max-model-len $MAX_MODEL_LEN \
+--trust-remote-code --kv-cache-dtype fp8 --gpu-memory-utilization 0.9 \
+--pipeline-parallel-size 1 --tensor-parallel-size $TP --max-num-batched-tokens 8192 --max-num-seqs 512 --max-model-len $MAX_MODEL_LEN \
 --enable-chunked-prefill --async-scheduling --no-enable-prefix-caching \
---compilation-config '{"pass_config": {"enable_fi_allreduce_fusion": true}, "custom_ops": ["+rms_norm"], "level": 3}' \
+--compilation-config "${FUSION_FLAG}" \
 --disable-log-requests > $SERVER_LOG 2>&1 &
 
 set +x
@@ -54,4 +77,4 @@ python3 bench_serving/benchmark_serving.py \
 --request-rate inf --ignore-eos \
 --save-result --percentile-metrics 'ttft,tpot,itl,e2el' \
 --result-dir /workspace/ \
---result-filename $RESULT_FILENAME.json
+--result-filename $RESULT_FILENAME.json
diff --git a/runners/launch_b200-nv.sh b/runners/launch_b200-nv.sh
@@ -5,7 +5,7 @@ export PORT_OFFSET=${USER: -1}
 
 MODEL_CODE="${1%%_*}"
 PARTITION="dgx-b200"
-SQUASH_FILE="/raid/image_${MODEL_CODE}_b200.sqsh"
+SQUASH_FILE="/raid/image_${MODEL_CODE}_b200-2.sqsh"  # "-2" suffix: separate file from the earlier image_<code>_b200.sqsh
 
 salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell
 JOB_ID=$(squeue -u $USER -h -o %A | head -n1)