Skip to content
35 changes: 34 additions & 1 deletion benchmarks/qwen3.5_bf16_b200.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,20 +19,53 @@ nvidia-smi

# Download the model referenced by $MODEL before the server is started.
hf download "$MODEL"

# Toggles exported into the environment of every child process started below.
export \
  NCCL_NVLS_ENABLE=1 \
  SGL_ENABLE_JIT_DEEPGEMM=false \
  SGLANG_ENABLE_FLASHINFER_GEMM=true \
  PYTHONUNBUFFERED=1

# File that receives the server's stdout/stderr.
SERVER_LOG=/workspace/server.log

# Listening port: keep a non-empty caller-supplied value, otherwise use 8888.
: "${PORT:=8888}"

# Scheduler receive interval is selected from the benchmark concurrency ($CONC):
#   CONC <  16 -> 10  (low-latency runs, e.g. concurrency 4 and 8)
#   CONC >= 16 -> 30  (max-throughput runs)
# An unset or empty CONC evaluates as 0 here and therefore selects 10.
if (( CONC >= 16 )); then
  SCHEDULER_RECV_INTERVAL=30
else
  SCHEDULER_RECV_INTERVAL=10
fi

# Values handed to sglang.launch_server through the matching command-line flags.
MEM_FRAC_STATIC=0.82
CHUNKED_PREFILL_SIZE=32768
MAX_PREFILL_TOKENS=32768
# CUDA graph batch-size cap follows the benchmark concurrency.
CUDA_GRAPH_MAX_BATCH_SIZE="$CONC"
MAX_RUNNING_REQUESTS=128
# ISL + OSL plus a fixed margin of 20
# (presumably input/output sequence lengths in tokens -- confirm with the caller).
CONTEXT_LENGTH=$((ISL + OSL + 20))

# Print the effective benchmark parameters and the current process list so
# they appear in the job output ahead of the server launch.
echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL"

ps aux

# Start the SGLang server in the background, with stdout and stderr redirected
# to $SERVER_LOG. Every expansion is quoted: an empty or whitespace-bearing
# value then reaches the server as one (possibly empty) argument and is rejected
# there, instead of vanishing and letting the flag consume the option after it.
# Tracing is switched on first so the fully expanded command line is logged.
set -x
PYTHONNOUSERSITE=1 python3 -m sglang.launch_server \
  --model-path="$MODEL" \
  --served-model-name "Qwen/Qwen3.5-397B-A17B" \
  --host=0.0.0.0 \
  --port="$PORT" \
  --trust-remote-code \
  --tensor-parallel-size="$TP" \
  --disable-radix-cache \
  --mem-fraction-static "$MEM_FRAC_STATIC" \
  --chunked-prefill-size "$CHUNKED_PREFILL_SIZE" \
  --max-prefill-tokens "$MAX_PREFILL_TOKENS" \
  --cuda-graph-max-bs "$CUDA_GRAPH_MAX_BATCH_SIZE" \
  --max-running-requests "$MAX_RUNNING_REQUESTS" \
  --context-length "$CONTEXT_LENGTH" \
  --attention-backend trtllm_mha \
  --moe-runner-backend flashinfer_trtllm \
  --tokenizer-worker-num 6 \
  --stream-interval 30 \
  --scheduler-recv-interval "$SCHEDULER_RECV_INTERVAL" \
  --enable-flashinfer-allreduce-fusion \
  > "$SERVER_LOG" 2>&1 &

# PID of the background server process started above.
SERVER_PID=$!
Expand Down
10 changes: 10 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -671,3 +671,13 @@
- "Environment: VLLM_ROCM_USE_AITER=1"
- "TP=2 and TP=4, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/755

- config-keys:
    - qwen3.5-bf16-b200-sglang
  description:
    - "Update Qwen3.5-397B-A17B BF16 SGLang B200 benchmark launch config"
    - "Image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e"
    - "Add trtllm_mha attention backend, flashinfer_trtllm MOE runner"
    - "Add context-length, tokenizer-worker-num, env tuning (NCCL_NVLS_ENABLE, SGLANG_ENABLE_FLASHINFER_GEMM)"
    - "Set cuda-graph-max-bs to match concurrency, scheduler-recv-interval based on concurrency"
  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/758