diff --git a/docs/autoregressive/Qwen/Qwen3.5.md b/docs/autoregressive/Qwen/Qwen3.5.md index 4281e437..3815fb9c 100644 --- a/docs/autoregressive/Qwen/Qwen3.5.md +++ b/docs/autoregressive/Qwen/Qwen3.5.md @@ -73,7 +73,6 @@ import Qwen35ConfigGenerator from '@site/src/components/autoregressive/Qwen35Con - To speed up weight loading for this large model, add `--model-loader-extra-config='{"enable_multithread_load": "true","num_threads": 64}'` to the launch command. - **CUDA IPC Transport**: Add `SGLANG_USE_CUDA_IPC_TRANSPORT=1` as an environment variable to use CUDA IPC for transferring multimodal features, significantly improving TTFT (Time To First Token). Note: this consumes additional memory proportional to image size, so you may need to lower `--mem-fraction-static` or `--max-running-requests`. - **Multimodal Attention Backend**: Use `--mm-attention-backend fa3` on H100/H200 for better vision performance, or `--mm-attention-backend fa4` on B200/B300. -- **B200 (FP8)**: Add `--enable-flashinfer-allreduce-fusion` for optimized throughput on Blackwell. - For processing large images or videos, you may need to lower `--mem-fraction-static` to leave room for image feature tensors. - Hardware requirements: - **BF16**: ~397B parameters require ~800GB of GPU memory for weights. diff --git a/src/components/autoregressive/Qwen35ConfigGenerator/index.js b/src/components/autoregressive/Qwen35ConfigGenerator/index.js index 5c364003..c0d5ee96 100644 --- a/src/components/autoregressive/Qwen35ConfigGenerator/index.js +++ b/src/components/autoregressive/Qwen35ConfigGenerator/index.js @@ -251,8 +251,13 @@ const Qwen35ConfigGenerator = () => { const epValue = hwConfig.ep; const memFraction = hwConfig.mem; + // Prepend SGLANG_ENABLE_SPEC_V2=1 for B200 FP8 + MTP (validated InferenceX#1065) + const envPrefix = (hardware === 'b200' && quantization === 'fp8' && speculative === 'enabled') + ? 'SGLANG_ENABLE_SPEC_V2=1 ' + : ''; + // Initialize the base command - let cmd = `sglang serve --model-path ${modelName}`; + let cmd = `${envPrefix}sglang serve --model-path ${modelName}`; if (tpValue > 1) { cmd += ` \\\n --tp ${tpValue}`; } @@ -286,8 +291,8 @@ const Qwen35ConfigGenerator = () => { cmd += ` \\\n --tokenizer-worker-num 6`; } - // Enable allreduce fusion for all Qwen3.5 configs (skip for FP4: benchmark only enables this for TP≥8). - if (quantization !== 'fp4') { + // Enable allreduce fusion for all Qwen3.5 configs (skip for FP4 and B200 FP8: benchmark does not enable it). + if (quantization !== 'fp4' && !(hardware === 'b200' && quantization === 'fp8')) { cmd += ` \\\n --enable-flashinfer-allreduce-fusion`; } @@ -299,6 +304,22 @@ const Qwen35ConfigGenerator = () => { } } + // B200 FP8-specific optimizations (validated in InferenceX#1027 and #1065 for MTP) + if (hardware === 'b200' && quantization === 'fp8') { + cmd += ` \\\n --enable-symm-mem`; + cmd += ` \\\n --disable-radix-cache`; + if (MOE_MODELS.has(model)) { + cmd += ` \\\n --mamba-ssm-dtype bfloat16`; + } + cmd += ` \\\n --moe-runner-backend flashinfer_trtllm`; + cmd += ` \\\n --chunked-prefill-size 16384`; + cmd += ` \\\n --max-prefill-tokens 16384`; + cmd += ` \\\n --stream-interval 50`; + if (speculative === 'enabled') { + cmd += ` \\\n --tokenizer-worker-num 6`; + } + } + // Append backend configurations if (hardware === 'b200' || hardware === 'b300') { cmd += ` \\\n --attention-backend trtllm_mha`;