diff --git a/data/models/generated/v0.5.8/qwen35.yaml b/data/models/generated/v0.5.8/qwen35.yaml
index 53afb4de..136dbe9e 100644
--- a/data/models/generated/v0.5.8/qwen35.yaml
+++ b/data/models/generated/v0.5.8/qwen35.yaml
@@ -132,3 +132,285 @@ families:
- '4'
prefill: null
decode: null
+ - name: Qwen3.5-397B-A17B-FP8
+ model_path: Qwen/Qwen3.5-397B-A17B-FP8
+ attributes:
+ llm:
+ thinking_capability: hybrid
+ tool_parser: qwen3_coder
+ reasoning_parser: qwen3
+ chat_template: null
+ hardware:
+ H100:
+ configurations:
+ - name: default
+ attributes:
+ nodes: single
+ optimization: balanced
+ quantization: fp8
+ quantized_model_path: null
+ engine:
+ env_vars: {}
+ tp: 8
+ dp: null
+ ep: null
+ enable_dp_attention: null
+ extra_args: []
+ prefill: null
+ decode: null
+ - name: speculative-mtp
+ attributes:
+ nodes: single
+ optimization: low-latency
+ quantization: fp8
+ quantized_model_path: null
+ engine:
+ env_vars: {}
+ tp: 8
+ dp: null
+ ep: null
+ enable_dp_attention: null
+ extra_args:
+ - --speculative-algo
+ - NEXTN
+ - --speculative-num-steps
+ - '3'
+ - --speculative-eagle-topk
+ - '1'
+ - --speculative-num-draft-tokens
+ - '4'
+ prefill: null
+ decode: null
+ H200:
+ configurations:
+ - name: default
+ attributes:
+ nodes: single
+ optimization: balanced
+ quantization: fp8
+ quantized_model_path: null
+ engine:
+ env_vars: {}
+ tp: 4
+ dp: null
+ ep: null
+ enable_dp_attention: null
+ extra_args: []
+ prefill: null
+ decode: null
+ - name: speculative-mtp
+ attributes:
+ nodes: single
+ optimization: low-latency
+ quantization: fp8
+ quantized_model_path: null
+ engine:
+ env_vars: {}
+ tp: 4
+ dp: null
+ ep: null
+ enable_dp_attention: null
+ extra_args:
+ - --speculative-algo
+ - NEXTN
+ - --speculative-num-steps
+ - '3'
+ - --speculative-eagle-topk
+ - '1'
+ - --speculative-num-draft-tokens
+ - '4'
+ prefill: null
+ decode: null
+ B200:
+ configurations:
+ - name: default
+ attributes:
+ nodes: single
+ optimization: balanced
+ quantization: fp8
+ quantized_model_path: null
+ engine:
+ env_vars: {}
+ tp: 4
+ dp: null
+ ep: null
+ enable_dp_attention: null
+ extra_args: []
+ prefill: null
+ decode: null
+ - name: speculative-mtp
+ attributes:
+ nodes: single
+ optimization: low-latency
+ quantization: fp8
+ quantized_model_path: null
+ engine:
+ env_vars: {}
+ tp: 4
+ dp: null
+ ep: null
+ enable_dp_attention: null
+ extra_args:
+ - --speculative-algo
+ - NEXTN
+ - --speculative-num-steps
+ - '3'
+ - --speculative-eagle-topk
+ - '1'
+ - --speculative-num-draft-tokens
+ - '4'
+ prefill: null
+ decode: null
+ B300:
+ configurations:
+ - name: default
+ attributes:
+ nodes: single
+ optimization: balanced
+ quantization: fp8
+ quantized_model_path: null
+ engine:
+ env_vars: {}
+ tp: 2
+ dp: null
+ ep: null
+ enable_dp_attention: null
+ extra_args: []
+ prefill: null
+ decode: null
+ - name: speculative-mtp
+ attributes:
+ nodes: single
+ optimization: low-latency
+ quantization: fp8
+ quantized_model_path: null
+ engine:
+ env_vars: {}
+ tp: 2
+ dp: null
+ ep: null
+ enable_dp_attention: null
+ extra_args:
+ - --speculative-algo
+ - NEXTN
+ - --speculative-num-steps
+ - '3'
+ - --speculative-eagle-topk
+ - '1'
+ - --speculative-num-draft-tokens
+ - '4'
+ prefill: null
+ decode: null
+ - name: Qwen3.5-397B-A17B-NVFP4
+ model_path: nvidia/Qwen3.5-397B-A17B-NVFP4
+ attributes:
+ llm:
+ thinking_capability: hybrid
+ tool_parser: qwen3_coder
+ reasoning_parser: qwen3
+ chat_template: null
+ hardware:
+ B200:
+ configurations:
+ - name: default
+ attributes:
+ nodes: single
+ optimization: balanced
+ quantization: fp4
+ quantized_model_path: null
+ engine:
+ env_vars: {}
+ tp: 4
+ dp: null
+ ep: null
+ enable_dp_attention: null
+ extra_args:
+ - --attention-backend
+ - trtllm_mha
+ - --moe-runner-backend
+ - flashinfer_trtllm
+ - --fp4-gemm-backend
+ - flashinfer_cutlass
+ prefill: null
+ decode: null
+ - name: speculative-mtp
+ attributes:
+ nodes: single
+ optimization: low-latency
+ quantization: fp4
+ quantized_model_path: null
+ engine:
+ env_vars: {}
+ tp: 4
+ dp: null
+ ep: null
+ enable_dp_attention: null
+ extra_args:
+ - --attention-backend
+ - trtllm_mha
+ - --moe-runner-backend
+ - flashinfer_trtllm
+ - --fp4-gemm-backend
+ - flashinfer_cutlass
+ - --speculative-algo
+ - NEXTN
+ - --speculative-num-steps
+ - '3'
+ - --speculative-eagle-topk
+ - '1'
+ - --speculative-num-draft-tokens
+ - '4'
+ prefill: null
+ decode: null
+ B300:
+ configurations:
+ - name: default
+ attributes:
+ nodes: single
+ optimization: balanced
+ quantization: fp4
+ quantized_model_path: null
+ engine:
+ env_vars: {}
+ tp: 2
+ dp: null
+ ep: null
+ enable_dp_attention: null
+ extra_args:
+ - --attention-backend
+ - trtllm_mha
+ - --moe-runner-backend
+ - flashinfer_trtllm
+ - --fp4-gemm-backend
+ - flashinfer_cutlass
+ prefill: null
+ decode: null
+ - name: speculative-mtp
+ attributes:
+ nodes: single
+ optimization: low-latency
+ quantization: fp4
+ quantized_model_path: null
+ engine:
+ env_vars: {}
+ tp: 2
+ dp: null
+ ep: null
+ enable_dp_attention: null
+ extra_args:
+ - --attention-backend
+ - trtllm_mha
+ - --moe-runner-backend
+ - flashinfer_trtllm
+ - --fp4-gemm-backend
+ - flashinfer_cutlass
+ - --speculative-algo
+ - NEXTN
+ - --speculative-num-steps
+ - '3'
+ - --speculative-eagle-topk
+ - '1'
+ - --speculative-num-draft-tokens
+ - '4'
+ prefill: null
+ decode: null
diff --git a/data/models/src/v0.5.8/qwen35.yaml b/data/models/src/v0.5.8/qwen35.yaml
index 0b63f00a..9e1893d3 100644
--- a/data/models/src/v0.5.8/qwen35.yaml
+++ b/data/models/src/v0.5.8/qwen35.yaml
@@ -1,10 +1,18 @@
# Qwen3.5 Model Configurations (Simplified Format)
# This file is compiled to data/models/generated/qwen35.yaml
#
-# GPU requirements:
+# GPU requirements (BF16):
# H100: tp=16 (model ~800GB in BF16, each rank needs ~100GB > 80GB)
# H200: tp=8
# B200: tp=8
+#
+# GPU requirements (FP8):
+# H100: tp=8 (model ~400GB in FP8, each rank needs ~50GB < 80GB)
+#   H200: tp=4
+#   B200: tp=4, B300: tp=2
+#
+# GPU requirements (FP4):
+# B200 (183GB) and B300 (275GB) - FP4 requires Blackwell architecture
vendor: qwen
@@ -41,3 +49,34 @@ families:
models:
- name: Qwen3.5-397B-A17B
quantization: bf16
+
+ - name: Qwen3.5-397B-A17B-FP8
+ quantization: fp8
+ hardware:
+ H100: { tp: 8 }
+ H200: { tp: 4 }
+ B200: { tp: 4 }
+ B300: { tp: 2 }
+
+ - name: Qwen3.5-397B-A17B-NVFP4
+ quantization: fp4
+ model_path: nvidia/Qwen3.5-397B-A17B-NVFP4
+ hardware:
+ B200:
+ tp: 4
+ extra_args:
+ - --attention-backend
+ - trtllm_mha
+ - --moe-runner-backend
+ - flashinfer_trtllm
+ - --fp4-gemm-backend
+ - flashinfer_cutlass
+ B300:
+ tp: 2
+ extra_args:
+ - --attention-backend
+ - trtllm_mha
+ - --moe-runner-backend
+ - flashinfer_trtllm
+ - --fp4-gemm-backend
+ - flashinfer_cutlass
diff --git a/docs/autoregressive/Qwen/Qwen3.5.md b/docs/autoregressive/Qwen/Qwen3.5.md
index 34c937bd..22beaa56 100644
--- a/docs/autoregressive/Qwen/Qwen3.5.md
+++ b/docs/autoregressive/Qwen/Qwen3.5.md
@@ -18,6 +18,8 @@ Qwen3.5 features a Gated Delta Networks combined with sparse Mixture-of-Experts
**Available Models:**
- **BF16 (Full precision)**: [Qwen/Qwen3.5-397B-A17B](https://huggingface.co/Qwen/Qwen3.5-397B-A17B)
+- **FP8 (8-bit Quantized)**: [Qwen/Qwen3.5-397B-A17B-FP8](https://huggingface.co/Qwen/Qwen3.5-397B-A17B-FP8)
+- **FP4 (4-bit Quantized)**: [nvidia/Qwen3.5-397B-A17B-NVFP4](https://huggingface.co/nvidia/Qwen3.5-397B-A17B-NVFP4)
**License:** Apache 2.0
@@ -49,22 +51,34 @@ import Qwen35ConfigGenerator from '@site/src/components/autoregressive/Qwen35Con
### 3.2 Configuration Tips
-- The model has ~397B parameters in BF16, requiring ~800GB of GPU memory for weights alone.
-- **H100 (80GB)** requires tp=16 (2 nodes) since each rank needs ~100GB at tp=8.
-- **H200 (141GB)** and **B200 (192GB)** can run with tp=8 on a single node.
- Speculative decoding (MTP) can significantly reduce latency for interactive use cases.
- The `--mem-fraction-static` flag is recommended for optimal memory utilization, adjust it based on your hardware and workload.
- Context length defaults to 262,144 tokens. If you encounter OOM errors, consider reducing it, but maintain at least 128K to preserve thinking capabilities.
- To speed up weight loading for this large model, add `--model-loader-extra-config='{"enable_multithread_load": "true","num_threads": 64}'` to the launch command.
- **CUDA IPC Transport**: Add `SGLANG_USE_CUDA_IPC_TRANSPORT=1` as an environment variable to use CUDA IPC for transferring multimodal features, significantly improving TTFT (Time To First Token). Note: this consumes additional memory proportional to image size, so you may need to lower `--mem-fraction-static` or `--max-running-requests`.
-- **Multimodal Attention Backend**: Use `--mm-attention-backend fa3` on H100/H200 for better vision performance, or `--mm-attention-backend fa4` on B200.
+- **Multimodal Attention Backend**: Use `--mm-attention-backend fa3` on H100/H200 for better vision performance, or `--mm-attention-backend fa4` on B200/B300.
- For processing large images or videos, you may need to lower `--mem-fraction-static` to leave room for image feature tensors.
-
-| Hardware | TP |
-| -------- | -- |
-| H100 | 16 |
-| H200 | 8 |
-| B200 | 8 |
+- Hardware requirements:
+ - **BF16**: ~397B parameters require ~800GB of GPU memory for weights.
+ - **H100 (80GB)** requires tp=16 (2 nodes) since each rank needs ~100GB at tp=8.
+ - **H200 (141GB)** runs with tp=8.
+ - **B200 (183GB)** runs with tp=8.
+ - **B300 (275GB)** runs with tp=4.
+ - **FP8**: The FP8 quantized model requires ~400GB for weights, cutting memory in half.
+ - **H100 (80GB)** runs with tp=8.
+ - **H200 (141GB)** runs with tp=4.
+ - **B200 (183GB)** runs with tp=4.
+ - **B300 (275GB)** runs with tp=2.
+ - **FP4**: The FP4 quantized model requires ~250GB for weights, cutting memory by almost 4x. Only compatible with B200/B300 (Blackwell architecture).
+ - **B200 (183GB)** runs with tp=4.
+ - **B300 (275GB)** runs with tp=2.
+
+| Hardware | Memory | BF16 TP | FP8 TP | FP4 TP |
+| -------- | ------ | ------- | ------ | --------------- |
+| H100 | 80GB | 16 | 8 | N/A |
+| H200 | 141GB | 8 | 4 | N/A |
+| B200 | 183GB | 8 | 4 | 4 |
+| B300 | 275GB | 4 | 2 | 2 |
## 4. Model Invocation
diff --git a/docs/intro.md b/docs/intro.md
index 134f5c6d..0c8ba70f 100644
--- a/docs/intro.md
+++ b/docs/intro.md
@@ -28,7 +28,7 @@ Each recipe provides step-by-step instructions to help you quickly implement SGL
#### Qwen
-- [ ] [Qwen3.5](./autoregressive/Qwen/Qwen3.5.md) NEW
+- [x] [Qwen3.5](./autoregressive/Qwen/Qwen3.5.md) NEW
- [x] [Qwen3](./autoregressive/Qwen/Qwen3.md)
- [x] [Qwen3-Next](./autoregressive/Qwen/Qwen3-Next.md)
- [x] [Qwen3-VL](./autoregressive/Qwen/Qwen3-VL.md)
diff --git a/src/components/autoregressive/Qwen35ConfigGenerator/index.js b/src/components/autoregressive/Qwen35ConfigGenerator/index.js
index 871088c4..713d015b 100644
--- a/src/components/autoregressive/Qwen35ConfigGenerator/index.js
+++ b/src/components/autoregressive/Qwen35ConfigGenerator/index.js
@@ -4,12 +4,23 @@ import ConfigGenerator from '../../base/ConfigGenerator';
/**
* Qwen3.5-397B-A17B Configuration Generator
* Supports Qwen3.5 397B (17B active) MoE VLM deployment configuration
- * with reasoning parser, tool calling, and speculative decoding
+ * with reasoning parser, tool calling, speculative decoding, and quantization options
*
- * GPU requirements:
+ * GPU requirements (BF16):
* H100: tp=16 (model ~800GB in BF16, each rank needs ~100GB > 80GB)
* H200: tp=8
* B200: tp=8
+ * B300: tp=4
+ *
+ * GPU requirements (FP8):
+ * H100: tp=8 (model ~400GB in FP8, each rank needs ~50GB < 80GB)
+ * H200: tp=4
+ * B200: tp=4
+ * B300: tp=2
+ *
+ * GPU requirements (FP4):
+ * B200: tp=4 (FP4 requires Blackwell)
+ * B300: tp=2 (FP4 requires Blackwell)
*/
const Qwen35ConfigGenerator = () => {
const config = {
@@ -19,10 +30,23 @@ const Qwen35ConfigGenerator = () => {
hardware: {
name: 'hardware',
title: 'Hardware Platform',
+ getDynamicItems: (values) => {
+ const isNvfp4 = values.quantization === 'fp4';
+ return [
+ { id: 'h100', label: 'H100', default: !isNvfp4, disabled: isNvfp4 },
+ { id: 'h200', label: 'H200', default: false, disabled: isNvfp4 },
+ { id: 'b200', label: 'B200', default: false, disabled: false },
+ { id: 'b300', label: 'B300', default: isNvfp4, disabled: false }
+ ];
+ }
+ },
+ quantization: {
+ name: 'quantization',
+ title: 'Quantization',
items: [
- { id: 'h200', label: 'H200', default: true },
- { id: 'b200', label: 'B200', default: false },
- { id: 'h100', label: 'H100', default: false }
+ { id: 'bf16', label: 'BF16', default: false },
+ { id: 'fp8', label: 'FP8', default: true },
+ { id: 'fp4', label: 'FP4', default: false }
]
},
reasoning: {
@@ -55,17 +79,29 @@ const Qwen35ConfigGenerator = () => {
},
modelConfigs: {
- h100: { bf16: { tp: 16, mem: 0.8 } },
- h200: { bf16: { tp: 8, mem: 0.8 } },
- b200: { bf16: { tp: 8, mem: 0.82 } }
+ h100: { bf16: { tp: 16, mem: 0.8 }, fp8: { tp: 8, mem: 0.8 } },
+ h200: { bf16: { tp: 8, mem: 0.8 }, fp8: { tp: 4, mem: 0.8 } },
+ b200: { bf16: { tp: 8, mem: 0.8 }, fp8: { tp: 4, mem: 0.8 }, fp4: { tp: 4, mem: 0.8 } },
+ b300: { bf16: { tp: 4, mem: 0.8 }, fp8: { tp: 2, mem: 0.8 }, fp4: { tp: 2, mem: 0.8 } }
},
generateCommand: function (values) {
- const { hardware, speculative } = values;
+ const { hardware, quantization, speculative } = values;
+
+ // Validate hardware supports the quantization
+ const hwConfig = this.modelConfigs[hardware]?.[quantization];
+ if (!hwConfig) {
+ return '# Please select compatible hardware for the chosen quantization\n# FP4 requires B200/B300 (Blackwell)';
+ }
- const modelName = `${this.modelFamily}/Qwen3.5-397B-A17B`;
+ let modelName;
+ if (quantization === 'fp4') {
+ modelName = 'nvidia/Qwen3.5-397B-A17B-NVFP4';
+ } else {
+ const quantSuffix = quantization === 'fp8' ? '-FP8' : '';
+ modelName = `${this.modelFamily}/Qwen3.5-397B-A17B${quantSuffix}`;
+ }
- const hwConfig = this.modelConfigs[hardware].bf16;
const tpValue = hwConfig.tp;
const memFraction = hwConfig.mem;
@@ -74,8 +110,9 @@ const Qwen35ConfigGenerator = () => {
cmd += ` --model ${modelName}`;
cmd += ` \\\n --tp ${tpValue}`;
- // Apply commandRule from all options
+ // Apply commandRule from all options except quantization (handled via model name)
Object.entries(this.options).forEach(([key, option]) => {
+ if (key === 'quantization') return;
if (option.commandRule) {
const rule = option.commandRule(values[key]);
if (rule) {
@@ -84,18 +121,27 @@ const Qwen35ConfigGenerator = () => {
}
});
- // Append B200-specific backend configurations
- if (hardware === 'b200') {
+ // Enable allreduce fusion for all Qwen3.5 configs.
+ cmd += ` \\\n --enable-flashinfer-allreduce-fusion`;
+
+ // Append backend configurations
+ if (hardware === 'b200' || hardware === 'b300') {
cmd += ` \\\n --attention-backend trtllm_mha`;
- cmd += ` \\\n --moe-runner-backend flashinfer_trtllm`;
- cmd += ` \\\n --disable-radix-cache`;
- cmd += ` \\\n --enable-flashinfer-allreduce-fusion`;
+ }
+
+  // B200/B300: add extra tokenizer workers when speculative decoding is disabled
+ if (hardware === 'b200' || hardware === 'b300') {
if (speculative === 'disabled') {
cmd += ` \\\n --tokenizer-worker-num 6`;
}
}
- // Add memory fraction
+ // FP4-specific backend settings
+ if (quantization === 'fp4') {
+ cmd += ' \\\n --moe-runner-backend flashinfer_trtllm \\\n --fp4-gemm-backend flashinfer_cutlass \\\n --kv-cache-dtype fp8_e4m3';
+ }
+
+ // Add memory fraction last
cmd += ` \\\n --mem-fraction-static ${memFraction}`;
return cmd;