docs/autoregressive/Qwen/Qwen3.5.md (1 change: 0 additions & 1 deletion)
```diff
@@ -73,7 +73,6 @@ import Qwen35ConfigGenerator from '@site/src/components/autoregressive/Qwen35Con
 - To speed up weight loading for this large model, add `--model-loader-extra-config='{"enable_multithread_load": "true","num_threads": 64}'` to the launch command.
 - **CUDA IPC Transport**: Add `SGLANG_USE_CUDA_IPC_TRANSPORT=1` as an environment variable to use CUDA IPC for transferring multimodal features, significantly improving TTFT (Time To First Token). Note: this consumes additional memory proportional to image size, so you may need to lower `--mem-fraction-static` or `--max-running-requests`.
 - **Multimodal Attention Backend**: Use `--mm-attention-backend fa3` on H100/H200 for better vision performance, or `--mm-attention-backend fa4` on B200/B300.
-- **B200 (FP8)**: Add `--enable-flashinfer-allreduce-fusion` for optimized throughput on Blackwell.
 - For processing large images or videos, you may need to lower `--mem-fraction-static` to leave room for image feature tensors.
 - Hardware requirements:
   - **BF16**: ~397B parameters require ~800GB of GPU memory for weights.
```
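Taken together, the tips that remain in this section translate into a launch command along the lines of the sketch below. Only the flags come from the documentation above; the model path, `--tp-size`, and the exact `--mem-fraction-static` value are illustrative assumptions, not values taken from this PR.

```bash
# Sketch of an H100/H200 launch using the documented performance tips.
# The model path and --tp-size are placeholders, not values from this PR;
# lower --mem-fraction-static further when processing large images or videos.
SGLANG_USE_CUDA_IPC_TRANSPORT=1 \
python -m sglang.launch_server \
  --model-path Qwen/Qwen3.5 \
  --tp-size 8 \
  --mm-attention-backend fa3 \
  --model-loader-extra-config='{"enable_multithread_load": "true","num_threads": 64}' \
  --mem-fraction-static 0.85
```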
src/components/autoregressive/Qwen35ConfigGenerator/index.js (17 changes: 15 additions & 2 deletions)
```diff
@@ -286,8 +286,8 @@ const Qwen35ConfigGenerator = () => {
     cmd += ` \\\n --tokenizer-worker-num 6`;
   }
 
-  // Enable allreduce fusion for all Qwen3.5 configs (skip for FP4: benchmark only enables this for TP≥8).
-  if (quantization !== 'fp4') {
+  // Enable allreduce fusion for all Qwen3.5 configs (skip for FP4 and B200 FP8: benchmark does not enable it).
+  if (quantization !== 'fp4' && !(hardware === 'b200' && quantization === 'fp8')) {
     cmd += ` \\\n --enable-flashinfer-allreduce-fusion`;
   }
 
@@ -299,6 +299,19 @@ const Qwen35ConfigGenerator = () => {
     }
   }
 
+  // B200 FP8-specific optimizations (validated in InferenceX#1027)
+  if (hardware === 'b200' && quantization === 'fp8') {
+    cmd += ` \\\n --enable-symm-mem`;
+    cmd += ` \\\n --disable-radix-cache`;
+    if (MOE_MODELS.has(model)) {
+      cmd += ` \\\n --mamba-ssm-dtype bfloat16`;
+    }
+    cmd += ` \\\n --moe-runner-backend flashinfer_trtllm`;
+    cmd += ` \\\n --chunked-prefill-size 16384`;
+    cmd += ` \\\n --max-prefill-tokens 16384`;
+    cmd += ` \\\n --stream-interval 50`;
+  }
+
   // Append backend configurations
   if (hardware === 'b200' || hardware === 'b300') {
     cmd += ` \\\n --attention-backend trtllm_mha`;
```
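After this change, selecting B200 with FP8 quantization in the generator yields a command shaped like the sketch below, with `--enable-flashinfer-allreduce-fusion` deliberately absent. The flags are the ones appended by the updated code; the model path and `--tp-size` are placeholder assumptions.

```bash
# Sketch of the command the updated generator emits for a B200 FP8 MoE config.
# Model path and --tp-size are placeholders; note that
# --enable-flashinfer-allreduce-fusion is intentionally not present.
python -m sglang.launch_server \
  --model-path Qwen/Qwen3.5-FP8 \
  --tp-size 8 \
  --tokenizer-worker-num 6 \
  --enable-symm-mem \
  --disable-radix-cache \
  --mamba-ssm-dtype bfloat16 \
  --moe-runner-backend flashinfer_trtllm \
  --chunked-prefill-size 16384 \
  --max-prefill-tokens 16384 \
  --stream-interval 50 \
  --attention-backend trtllm_mha
```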