diff --git a/data/models/generated/v0.5.8/qwen35.yaml b/data/models/generated/v0.5.8/qwen35.yaml index 53afb4de..136dbe9e 100644 --- a/data/models/generated/v0.5.8/qwen35.yaml +++ b/data/models/generated/v0.5.8/qwen35.yaml @@ -132,3 +132,285 @@ families: - '4' prefill: null decode: null + - name: Qwen3.5-397B-A17B-FP8 + model_path: Qwen/Qwen3.5-397B-A17B-FP8 + attributes: + llm: + thinking_capability: hybrid + tool_parser: qwen3_coder + reasoning_parser: qwen3 + chat_template: null + hardware: + H100: + configurations: + - name: default + attributes: + nodes: single + optimization: balanced + quantization: fp8 + quantized_model_path: null + engine: + env_vars: {} + tp: 8 + dp: null + ep: null + enable_dp_attention: null + extra_args: [] + prefill: null + decode: null + - name: speculative-mtp + attributes: + nodes: single + optimization: low-latency + quantization: fp8 + quantized_model_path: null + engine: + env_vars: {} + tp: 8 + dp: null + ep: null + enable_dp_attention: null + extra_args: + - --speculative-algo + - NEXTN + - --speculative-num-steps + - '3' + - --speculative-eagle-topk + - '1' + - --speculative-num-draft-tokens + - '4' + prefill: null + decode: null + H200: + configurations: + - name: default + attributes: + nodes: single + optimization: balanced + quantization: fp8 + quantized_model_path: null + engine: + env_vars: {} + tp: 4 + dp: null + ep: null + enable_dp_attention: null + extra_args: [] + prefill: null + decode: null + - name: speculative-mtp + attributes: + nodes: single + optimization: low-latency + quantization: fp8 + quantized_model_path: null + engine: + env_vars: {} + tp: 4 + dp: null + ep: null + enable_dp_attention: null + extra_args: + - --speculative-algo + - NEXTN + - --speculative-num-steps + - '3' + - --speculative-eagle-topk + - '1' + - --speculative-num-draft-tokens + - '4' + prefill: null + decode: null + B200: + configurations: + - name: default + attributes: + nodes: single + optimization: balanced + quantization: fp8 + 
quantized_model_path: null + engine: + env_vars: {} + tp: 4 + dp: null + ep: null + enable_dp_attention: null + extra_args: [] + prefill: null + decode: null + - name: speculative-mtp + attributes: + nodes: single + optimization: low-latency + quantization: fp8 + quantized_model_path: null + engine: + env_vars: {} + tp: 4 + dp: null + ep: null + enable_dp_attention: null + extra_args: + - --speculative-algo + - NEXTN + - --speculative-num-steps + - '3' + - --speculative-eagle-topk + - '1' + - --speculative-num-draft-tokens + - '4' + prefill: null + decode: null + B300: + configurations: + - name: default + attributes: + nodes: single + optimization: balanced + quantization: fp8 + quantized_model_path: null + engine: + env_vars: {} + tp: 2 + dp: null + ep: null + enable_dp_attention: null + extra_args: [] + prefill: null + decode: null + - name: speculative-mtp + attributes: + nodes: single + optimization: low-latency + quantization: fp8 + quantized_model_path: null + engine: + env_vars: {} + tp: 2 + dp: null + ep: null + enable_dp_attention: null + extra_args: + - --speculative-algo + - NEXTN + - --speculative-num-steps + - '3' + - --speculative-eagle-topk + - '1' + - --speculative-num-draft-tokens + - '4' + prefill: null + decode: null + - name: Qwen3.5-397B-A17B-NVFP4 + model_path: nvidia/Qwen3.5-397B-A17B-NVFP4 + attributes: + llm: + thinking_capability: hybrid + tool_parser: qwen3_coder + reasoning_parser: qwen3 + chat_template: null + hardware: + B200: + configurations: + - name: default + attributes: + nodes: single + optimization: balanced + quantization: fp4 + quantized_model_path: null + engine: + env_vars: {} + tp: 4 + dp: null + ep: null + enable_dp_attention: null + extra_args: + - --attention-backend + - trtllm_mha + - --moe-runner-backend + - flashinfer_trtllm + - --fp4-gemm-backend + - flashinfer_cutlass + prefill: null + decode: null + - name: speculative-mtp + attributes: + nodes: single + optimization: low-latency + quantization: fp4 + 
quantized_model_path: null + engine: + env_vars: {} + tp: 4 + dp: null + ep: null + enable_dp_attention: null + extra_args: + - --attention-backend + - trtllm_mha + - --moe-runner-backend + - flashinfer_trtllm + - --fp4-gemm-backend + - flashinfer_cutlass + - --speculative-algo + - NEXTN + - --speculative-num-steps + - '3' + - --speculative-eagle-topk + - '1' + - --speculative-num-draft-tokens + - '4' + prefill: null + decode: null + B300: + configurations: + - name: default + attributes: + nodes: single + optimization: balanced + quantization: fp4 + quantized_model_path: null + engine: + env_vars: {} + tp: 2 + dp: null + ep: null + enable_dp_attention: null + extra_args: + - --attention-backend + - trtllm_mha + - --moe-runner-backend + - flashinfer_trtllm + - --fp4-gemm-backend + - flashinfer_cutlass + prefill: null + decode: null + - name: speculative-mtp + attributes: + nodes: single + optimization: low-latency + quantization: fp4 + quantized_model_path: null + engine: + env_vars: {} + tp: 2 + dp: null + ep: null + enable_dp_attention: null + extra_args: + - --attention-backend + - trtllm_mha + - --moe-runner-backend + - flashinfer_trtllm + - --fp4-gemm-backend + - flashinfer_cutlass + - --speculative-algo + - NEXTN + - --speculative-num-steps + - '3' + - --speculative-eagle-topk + - '1' + - --speculative-num-draft-tokens + - '4' + prefill: null + decode: null diff --git a/data/models/src/v0.5.8/qwen35.yaml b/data/models/src/v0.5.8/qwen35.yaml index 0b63f00a..9e1893d3 100644 --- a/data/models/src/v0.5.8/qwen35.yaml +++ b/data/models/src/v0.5.8/qwen35.yaml @@ -1,10 +1,18 @@ # Qwen3.5 Model Configurations (Simplified Format) # This file is compiled to data/models/generated/qwen35.yaml # -# GPU requirements: +# GPU requirements (BF16): # H100: tp=16 (model ~800GB in BF16, each rank needs ~100GB > 80GB) # H200: tp=8 # B200: tp=8 +# +# GPU requirements (FP8): +# H100: tp=8 (model ~400GB in FP8, each rank needs ~50GB < 80GB) +# H200: tp=4 +# B200: tp=4 +# +# GPU 
requirements (FP4): +# B200 (183GB) and B300 (275GB) - FP4 requires Blackwell architecture vendor: qwen @@ -41,3 +49,34 @@ families: models: - name: Qwen3.5-397B-A17B quantization: bf16 + + - name: Qwen3.5-397B-A17B-FP8 + quantization: fp8 + hardware: + H100: { tp: 8 } + H200: { tp: 4 } + B200: { tp: 4 } + B300: { tp: 2 } + + - name: Qwen3.5-397B-A17B-NVFP4 + quantization: fp4 + model_path: nvidia/Qwen3.5-397B-A17B-NVFP4 + hardware: + B200: + tp: 4 + extra_args: + - --attention-backend + - trtllm_mha + - --moe-runner-backend + - flashinfer_trtllm + - --fp4-gemm-backend + - flashinfer_cutlass + B300: + tp: 2 + extra_args: + - --attention-backend + - trtllm_mha + - --moe-runner-backend + - flashinfer_trtllm + - --fp4-gemm-backend + - flashinfer_cutlass diff --git a/docs/autoregressive/Qwen/Qwen3.5.md b/docs/autoregressive/Qwen/Qwen3.5.md index 34c937bd..22beaa56 100644 --- a/docs/autoregressive/Qwen/Qwen3.5.md +++ b/docs/autoregressive/Qwen/Qwen3.5.md @@ -18,6 +18,8 @@ Qwen3.5 features a Gated Delta Networks combined with sparse Mixture-of-Experts **Available Models:** - **BF16 (Full precision)**: [Qwen/Qwen3.5-397B-A17B](https://huggingface.co/Qwen/Qwen3.5-397B-A17B) +- **FP8 (8-bit Quantized)**: [Qwen/Qwen3.5-397B-A17B-FP8](https://huggingface.co/Qwen/Qwen3.5-397B-A17B-FP8) +- **FP4 (4-bit Quantized)**: [nvidia/Qwen3.5-397B-A17B-NVFP4](https://huggingface.co/nvidia/Qwen3.5-397B-A17B-NVFP4) **License:** Apache 2.0 @@ -49,22 +51,34 @@ import Qwen35ConfigGenerator from '@site/src/components/autoregressive/Qwen35Con ### 3.2 Configuration Tips -- The model has ~397B parameters in BF16, requiring ~800GB of GPU memory for weights alone. -- **H100 (80GB)** requires tp=16 (2 nodes) since each rank needs ~100GB at tp=8. -- **H200 (141GB)** and **B200 (192GB)** can run with tp=8 on a single node. - Speculative decoding (MTP) can significantly reduce latency for interactive use cases. 
- The `--mem-fraction-static` flag is recommended for optimal memory utilization, adjust it based on your hardware and workload. - Context length defaults to 262,144 tokens. If you encounter OOM errors, consider reducing it, but maintain at least 128K to preserve thinking capabilities. - To speed up weight loading for this large model, add `--model-loader-extra-config='{"enable_multithread_load": "true","num_threads": 64}'` to the launch command. - **CUDA IPC Transport**: Add `SGLANG_USE_CUDA_IPC_TRANSPORT=1` as an environment variable to use CUDA IPC for transferring multimodal features, significantly improving TTFT (Time To First Token). Note: this consumes additional memory proportional to image size, so you may need to lower `--mem-fraction-static` or `--max-running-requests`. -- **Multimodal Attention Backend**: Use `--mm-attention-backend fa3` on H100/H200 for better vision performance, or `--mm-attention-backend fa4` on B200. +- **Multimodal Attention Backend**: Use `--mm-attention-backend fa3` on H100/H200 for better vision performance, or `--mm-attention-backend fa4` on B200/B300. - For processing large images or videos, you may need to lower `--mem-fraction-static` to leave room for image feature tensors. - -| Hardware | TP | -| -------- | -- | -| H100 | 16 | -| H200 | 8 | -| B200 | 8 | +- Hardware requirements: + - **BF16**: ~397B parameters require ~800GB of GPU memory for weights. + - **H100 (80GB)** requires tp=16 (2 nodes) since each rank needs ~100GB at tp=8. + - **H200 (141GB)** runs with tp=8. + - **B200 (183GB)** runs with tp=8. + - **B300 (275GB)** runs with tp=4. + - **FP8**: The FP8 quantized model requires ~400GB for weights, cutting memory in half. + - **H100 (80GB)** runs with tp=8. + - **H200 (141GB)** runs with tp=4. + - **B200 (183GB)** runs with tp=4. + - **B300 (275GB)** runs with tp=2. + - **FP4**: The FP4 quantized model requires ~250GB for weights, cutting memory by almost 4x. Only compatible with B200/B300 (Blackwell architecture). 
+ - **B200 (183GB)** runs with tp=4. + - **B300 (275GB)** runs with tp=2. + +| Hardware | Memory | BF16 TP | FP8 TP | FP4 TP | +| -------- | ------ | ------- | ------ | --------------- | +| H100 | 80GB | 16 | 8 | N/A | +| H200 | 141GB | 8 | 4 | N/A | +| B200 | 183GB | 8 | 4 | 4 | +| B300 | 275GB | 4 | 2 | 2 | ## 4. Model Invocation diff --git a/docs/intro.md b/docs/intro.md index 134f5c6d..0c8ba70f 100644 --- a/docs/intro.md +++ b/docs/intro.md @@ -28,7 +28,7 @@ Each recipe provides step-by-step instructions to help you quickly implement SGL #### Qwen -- [ ] [Qwen3.5](./autoregressive/Qwen/Qwen3.5.md) NEW +- [x] [Qwen3.5](./autoregressive/Qwen/Qwen3.5.md) NEW - [x] [Qwen3](./autoregressive/Qwen/Qwen3.md) - [x] [Qwen3-Next](./autoregressive/Qwen/Qwen3-Next.md) - [x] [Qwen3-VL](./autoregressive/Qwen/Qwen3-VL.md) diff --git a/src/components/autoregressive/Qwen35ConfigGenerator/index.js b/src/components/autoregressive/Qwen35ConfigGenerator/index.js index 871088c4..713d015b 100644 --- a/src/components/autoregressive/Qwen35ConfigGenerator/index.js +++ b/src/components/autoregressive/Qwen35ConfigGenerator/index.js @@ -4,12 +4,23 @@ import ConfigGenerator from '../../base/ConfigGenerator'; /** * Qwen3.5-397B-A17B Configuration Generator * Supports Qwen3.5 397B (17B active) MoE VLM deployment configuration - * with reasoning parser, tool calling, and speculative decoding + * with reasoning parser, tool calling, speculative decoding, and quantization options * - * GPU requirements: + * GPU requirements (BF16): * H100: tp=16 (model ~800GB in BF16, each rank needs ~100GB > 80GB) * H200: tp=8 * B200: tp=8 + * B300: tp=4 + * + * GPU requirements (FP8): + * H100: tp=8 (model ~400GB in FP8, each rank needs ~50GB < 80GB) + * H200: tp=4 + * B200: tp=4 + * B300: tp=2 + * + * GPU requirements (FP4): + * B200: tp=4 (FP4 requires Blackwell) + * B300: tp=2 (FP4 requires Blackwell) */ const Qwen35ConfigGenerator = () => { const config = { @@ -19,10 +30,23 @@ const Qwen35ConfigGenerator = 
() => { hardware: { name: 'hardware', title: 'Hardware Platform', + getDynamicItems: (values) => { + const isNvfp4 = values.quantization === 'fp4'; + return [ + { id: 'h100', label: 'H100', default: !isNvfp4, disabled: isNvfp4 }, + { id: 'h200', label: 'H200', default: false, disabled: isNvfp4 }, + { id: 'b200', label: 'B200', default: false, disabled: false }, + { id: 'b300', label: 'B300', default: isNvfp4, disabled: false } + ]; + } + }, + quantization: { + name: 'quantization', + title: 'Quantization', items: [ - { id: 'h200', label: 'H200', default: true }, - { id: 'b200', label: 'B200', default: false }, - { id: 'h100', label: 'H100', default: false } + { id: 'bf16', label: 'BF16', default: false }, + { id: 'fp8', label: 'FP8', default: true }, + { id: 'fp4', label: 'FP4', default: false } ] }, reasoning: { @@ -55,17 +79,29 @@ const Qwen35ConfigGenerator = () => { }, modelConfigs: { - h100: { bf16: { tp: 16, mem: 0.8 } }, - h200: { bf16: { tp: 8, mem: 0.8 } }, - b200: { bf16: { tp: 8, mem: 0.82 } } + h100: { bf16: { tp: 16, mem: 0.8 }, fp8: { tp: 8, mem: 0.8 } }, + h200: { bf16: { tp: 8, mem: 0.8 }, fp8: { tp: 4, mem: 0.8 } }, + b200: { bf16: { tp: 8, mem: 0.8 }, fp8: { tp: 4, mem: 0.8 }, fp4: { tp: 4, mem: 0.8 } }, + b300: { bf16: { tp: 4, mem: 0.8 }, fp8: { tp: 2, mem: 0.8 }, fp4: { tp: 2, mem: 0.8 } } }, generateCommand: function (values) { - const { hardware, speculative } = values; + const { hardware, quantization, speculative } = values; + + // Validate hardware supports the quantization + const hwConfig = this.modelConfigs[hardware]?.[quantization]; + if (!hwConfig) { + return '# Please select compatible hardware for the chosen quantization\n# FP4 requires B200/B300 (Blackwell)'; + } - const modelName = `${this.modelFamily}/Qwen3.5-397B-A17B`; + let modelName; + if (quantization === 'fp4') { + modelName = 'nvidia/Qwen3.5-397B-A17B-NVFP4'; + } else { + const quantSuffix = quantization === 'fp8' ? 
'-FP8' : ''; + modelName = `${this.modelFamily}/Qwen3.5-397B-A17B${quantSuffix}`; + } - const hwConfig = this.modelConfigs[hardware].bf16; const tpValue = hwConfig.tp; const memFraction = hwConfig.mem; @@ -74,8 +110,9 @@ const Qwen35ConfigGenerator = () => { cmd += ` --model ${modelName}`; cmd += ` \\\n --tp ${tpValue}`; - // Apply commandRule from all options + // Apply commandRule from all options except quantization (handled via model name) Object.entries(this.options).forEach(([key, option]) => { + if (key === 'quantization') return; if (option.commandRule) { const rule = option.commandRule(values[key]); if (rule) { @@ -84,18 +121,27 @@ const Qwen35ConfigGenerator = () => { } }); - // Append B200-specific backend configurations - if (hardware === 'b200') { + // Enable allreduce fusion for all Qwen3.5 configs. + cmd += ` \\\n --enable-flashinfer-allreduce-fusion`; + + // Append backend configurations + if (hardware === 'b200' || hardware === 'b300') { cmd += ` \\\n --attention-backend trtllm_mha`; - cmd += ` \\\n --moe-runner-backend flashinfer_trtllm`; - cmd += ` \\\n --disable-radix-cache`; - cmd += ` \\\n --enable-flashinfer-allreduce-fusion`; + } + + // Append B200/B300-specific backend configurations + if (hardware === 'b200' || hardware === 'b300') { if (speculative === 'disabled') { cmd += ` \\\n --tokenizer-worker-num 6`; } } - // Add memory fraction + // FP4-specific backend settings + if (quantization === 'fp4') { + cmd += ' \\\n --moe-runner-backend flashinfer_trtllm \\\n --fp4-gemm-backend flashinfer_cutlass \\\n --kv-cache-dtype fp8_e4m3'; + } + + // Add memory fraction last cmd += ` \\\n --mem-fraction-static ${memFraction}`; return cmd;