diff --git a/docs/autoregressive/Qwen/Qwen3.5.md b/docs/autoregressive/Qwen/Qwen3.5.md
index 4281e437..3815fb9c 100644
--- a/docs/autoregressive/Qwen/Qwen3.5.md
+++ b/docs/autoregressive/Qwen/Qwen3.5.md
@@ -73,7 +73,6 @@ import Qwen35ConfigGenerator from '@site/src/components/autoregressive/Qwen35Con
 - To speed up weight loading for this large model, add `--model-loader-extra-config='{"enable_multithread_load": "true","num_threads": 64}'` to the launch command.
 - **CUDA IPC Transport**: Add `SGLANG_USE_CUDA_IPC_TRANSPORT=1` as an environment variable to use CUDA IPC for transferring multimodal features, significantly improving TTFT (Time To First Token). Note: this consumes additional memory proportional to image size, so you may need to lower `--mem-fraction-static` or `--max-running-requests`.
 - **Multimodal Attention Backend**: Use `--mm-attention-backend fa3` on H100/H200 for better vision performance, or `--mm-attention-backend fa4` on B200/B300.
-- **B200 (FP8)**: Add `--enable-flashinfer-allreduce-fusion` for optimized throughput on Blackwell.
 - For processing large images or videos, you may need to lower `--mem-fraction-static` to leave room for image feature tensors.
 - Hardware requirements:
     - **BF16**: ~397B parameters require ~800GB of GPU memory for weights.
diff --git a/src/components/autoregressive/Qwen35ConfigGenerator/index.js b/src/components/autoregressive/Qwen35ConfigGenerator/index.js
index 5c364003..c0d5ee96 100644
--- a/src/components/autoregressive/Qwen35ConfigGenerator/index.js
+++ b/src/components/autoregressive/Qwen35ConfigGenerator/index.js
@@ -251,8 +251,13 @@ const Qwen35ConfigGenerator = () => {
       const epValue = hwConfig.ep;
       const memFraction = hwConfig.mem;
 
+      // Prepend SGLANG_ENABLE_SPEC_V2=1 for B200 FP8 + MTP (validated in InferenceX#1065)
+      const envPrefix = (hardware === 'b200' && quantization === 'fp8' && speculative === 'enabled')
+        ? 'SGLANG_ENABLE_SPEC_V2=1 '
+        : '';
+
       // Initialize the base command
-      let cmd = `sglang serve --model-path ${modelName}`;
+      let cmd = `${envPrefix}sglang serve --model-path ${modelName}`;
       if (tpValue > 1) {
         cmd += ` \\\n  --tp ${tpValue}`;
       }
@@ -286,8 +291,8 @@ const Qwen35ConfigGenerator = () => {
         cmd += ` \\\n  --tokenizer-worker-num 6`;
       }
 
-      // Enable allreduce fusion for all Qwen3.5 configs (skip for FP4: benchmark only enables this for TP≥8).
-      if (quantization !== 'fp4') {
+      // Enable allreduce fusion for Qwen3.5 configs, except FP4 (benchmark only enables it for TP≥8) and B200 FP8 (benchmark does not enable it).
+      if (quantization !== 'fp4' && !(hardware === 'b200' && quantization === 'fp8')) {
         cmd += ` \\\n  --enable-flashinfer-allreduce-fusion`;
       }
 
@@ -299,6 +304,22 @@ const Qwen35ConfigGenerator = () => {
         }
       }
 
+      // B200 FP8-specific optimizations (validated in InferenceX#1027; MTP variant validated in InferenceX#1065)
+      if (hardware === 'b200' && quantization === 'fp8') {
+        cmd += ` \\\n  --enable-symm-mem`;
+        cmd += ` \\\n  --disable-radix-cache`;
+        if (MOE_MODELS.has(model)) {
+          cmd += ` \\\n  --mamba-ssm-dtype bfloat16`;
+        }
+        cmd += ` \\\n  --moe-runner-backend flashinfer_trtllm`;
+        cmd += ` \\\n  --chunked-prefill-size 16384`;
+        cmd += ` \\\n  --max-prefill-tokens 16384`;
+        cmd += ` \\\n  --stream-interval 50`;
+        if (speculative === 'enabled') {
+          cmd += ` \\\n  --tokenizer-worker-num 6`;
+        }
+      }
+
       // Append backend configurations
       if (hardware === 'b200' || hardware === 'b300') {
         cmd += ` \\\n  --attention-backend trtllm_mha`;