Merged
282 changes: 282 additions & 0 deletions data/models/generated/v0.5.8/qwen35.yaml
@@ -132,3 +132,285 @@ families:
- '4'
prefill: null
decode: null
- name: Qwen3.5-397B-A17B-FP8
model_path: Qwen/Qwen3.5-397B-A17B-FP8
attributes:
llm:
thinking_capability: hybrid
tool_parser: qwen3_coder
reasoning_parser: qwen3
chat_template: null
hardware:
H100:
configurations:
- name: default
attributes:
nodes: single
optimization: balanced
quantization: fp8
quantized_model_path: null
engine:
env_vars: {}
tp: 8
dp: null
ep: null
enable_dp_attention: null
extra_args: []
prefill: null
decode: null
- name: speculative-mtp
attributes:
nodes: single
optimization: low-latency
quantization: fp8
quantized_model_path: null
engine:
env_vars: {}
tp: 8
dp: null
ep: null
enable_dp_attention: null
extra_args:
- --speculative-algo
- NEXTN
- --speculative-num-steps
- '3'
- --speculative-eagle-topk
- '1'
- --speculative-num-draft-tokens
- '4'
prefill: null
decode: null
H200:
configurations:
- name: default
attributes:
nodes: single
optimization: balanced
quantization: fp8
quantized_model_path: null
engine:
env_vars: {}
tp: 4
dp: null
ep: null
enable_dp_attention: null
extra_args: []
prefill: null
decode: null
- name: speculative-mtp
attributes:
nodes: single
optimization: low-latency
quantization: fp8
quantized_model_path: null
engine:
env_vars: {}
tp: 4
dp: null
ep: null
enable_dp_attention: null
extra_args:
- --speculative-algo
- NEXTN
- --speculative-num-steps
- '3'
- --speculative-eagle-topk
- '1'
- --speculative-num-draft-tokens
- '4'
prefill: null
decode: null
B200:
configurations:
- name: default
attributes:
nodes: single
optimization: balanced
quantization: fp8
quantized_model_path: null
engine:
env_vars: {}
tp: 4
dp: null
ep: null
enable_dp_attention: null
extra_args: []
prefill: null
decode: null
- name: speculative-mtp
attributes:
nodes: single
optimization: low-latency
quantization: fp8
quantized_model_path: null
engine:
env_vars: {}
tp: 4
dp: null
ep: null
enable_dp_attention: null
extra_args:
- --speculative-algo
- NEXTN
- --speculative-num-steps
- '3'
- --speculative-eagle-topk
- '1'
- --speculative-num-draft-tokens
- '4'
prefill: null
decode: null
B300:
configurations:
- name: default
attributes:
nodes: single
optimization: balanced
quantization: fp8
quantized_model_path: null
engine:
env_vars: {}
tp: 2
dp: null
ep: null
enable_dp_attention: null
extra_args: []
prefill: null
decode: null
- name: speculative-mtp
attributes:
nodes: single
optimization: low-latency
quantization: fp8
quantized_model_path: null
engine:
env_vars: {}
tp: 2
dp: null
ep: null
enable_dp_attention: null
extra_args:
- --speculative-algo
- NEXTN
- --speculative-num-steps
- '3'
- --speculative-eagle-topk
- '1'
- --speculative-num-draft-tokens
- '4'
prefill: null
decode: null
- name: Qwen3.5-397B-A17B-NVFP4
model_path: nvidia/Qwen3.5-397B-A17B-NVFP4
attributes:
llm:
thinking_capability: hybrid
tool_parser: qwen3_coder
reasoning_parser: qwen3
chat_template: null
hardware:
B200:
configurations:
- name: default
attributes:
nodes: single
optimization: balanced
quantization: fp4
quantized_model_path: null
engine:
env_vars: {}
tp: 4
dp: null
ep: null
enable_dp_attention: null
extra_args:
- --attention-backend
- trtllm_mha
- --moe-runner-backend
- flashinfer_trtllm
- --fp4-gemm-backend
- flashinfer_cutlass
prefill: null
decode: null
- name: speculative-mtp
attributes:
nodes: single
optimization: low-latency
quantization: fp4
quantized_model_path: null
engine:
env_vars: {}
tp: 4
dp: null
ep: null
enable_dp_attention: null
extra_args:
- --attention-backend
- trtllm_mha
- --moe-runner-backend
- flashinfer_trtllm
- --fp4-gemm-backend
- flashinfer_cutlass
- --speculative-algo
- NEXTN
- --speculative-num-steps
- '3'
- --speculative-eagle-topk
- '1'
- --speculative-num-draft-tokens
- '4'
prefill: null
decode: null
B300:
configurations:
- name: default
attributes:
nodes: single
optimization: balanced
quantization: fp4
quantized_model_path: null
engine:
env_vars: {}
tp: 2
dp: null
ep: null
enable_dp_attention: null
extra_args:
- --attention-backend
- trtllm_mha
- --moe-runner-backend
- flashinfer_trtllm
- --fp4-gemm-backend
- flashinfer_cutlass
prefill: null
decode: null
- name: speculative-mtp
attributes:
nodes: single
optimization: low-latency
quantization: fp4
quantized_model_path: null
engine:
env_vars: {}
tp: 2
dp: null
ep: null
enable_dp_attention: null
extra_args:
- --attention-backend
- trtllm_mha
- --moe-runner-backend
- flashinfer_trtllm
- --fp4-gemm-backend
- flashinfer_cutlass
- --speculative-algo
- NEXTN
- --speculative-num-steps
- '3'
- --speculative-eagle-topk
- '1'
- --speculative-num-draft-tokens
- '4'
prefill: null
decode: null
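Each generated configuration above is meant to translate mechanically into engine launch arguments. A minimal sketch of that translation, mirroring the H100 `speculative-mtp` entry (the `extra_args` values are taken verbatim from the config; the `sglang.launch_server` entry point and `--tp` flag spelling are assumptions for illustration):

```python
# Sketch: assemble a server argv list from one generated configuration.
config = {
    "model_path": "Qwen/Qwen3.5-397B-A17B-FP8",
    "tp": 8,
    "extra_args": [
        "--speculative-algo", "NEXTN",
        "--speculative-num-steps", "3",
        "--speculative-eagle-topk", "1",
        "--speculative-num-draft-tokens", "4",
    ],
}

def build_argv(cfg):
    """Turn a configuration entry into a launch command argv."""
    argv = [
        "python3", "-m", "sglang.launch_server",
        "--model-path", cfg["model_path"],
        "--tp", str(cfg["tp"]),
    ]
    argv += cfg["extra_args"]
    return argv

print(" ".join(build_argv(config)))
```

Null fields (`dp`, `ep`, `enable_dp_attention`, …) are simply omitted from the argv in this sketch; a real compiler would map each non-null attribute to its corresponding flag.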
41 changes: 40 additions & 1 deletion data/models/src/v0.5.8/qwen35.yaml
@@ -1,10 +1,18 @@
# Qwen3.5 Model Configurations (Simplified Format)
# This file is compiled to data/models/generated/qwen35.yaml
#
# GPU requirements (BF16):
# H100: tp=16 (model ~800GB in BF16, each rank needs ~100GB > 80GB)
# H200: tp=8
# B200: tp=8
#
# GPU requirements (FP8):
# H100: tp=8 (model ~400GB in FP8, each rank needs ~50GB < 80GB)
# H200: tp=8
# B200: tp=8
#
# GPU requirements (FP4):
# B200 (183GB) and B300 (275GB) - FP4 requires Blackwell architecture

vendor: qwen

@@ -41,3 +49,34 @@ families:
models:
- name: Qwen3.5-397B-A17B
quantization: bf16

- name: Qwen3.5-397B-A17B-FP8
quantization: fp8
hardware:
H100: { tp: 8 }
H200: { tp: 4 }
B200: { tp: 4 }
B300: { tp: 2 }

- name: Qwen3.5-397B-A17B-NVFP4
quantization: fp4
model_path: nvidia/Qwen3.5-397B-A17B-NVFP4
hardware:
B200:
tp: 4
extra_args:
- --attention-backend
- trtllm_mha
- --moe-runner-backend
- flashinfer_trtllm
- --fp4-gemm-backend
- flashinfer_cutlass
Comment on lines +72 to +73
Why not just use the default value? flashinfer_cutlass is the current best due to an issue in flashinfer, but when the flashinfer issue is fixed, this will not be the best option.

B300:
tp: 2
extra_args:
- --attention-backend
- trtllm_mha
- --moe-runner-backend
- flashinfer_trtllm
- --fp4-gemm-backend
- flashinfer_cutlass
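The `GPU requirements` comments at the top of this file follow from simple weight-memory arithmetic. A sketch of that reasoning, using the parameter count and per-GPU capacities quoted in the comments (BF16 taken as 2 bytes/param and FP8 as 1 byte/param, ignoring activations and KV cache, so the result is a lower bound on tp):

```python
# Sketch: smallest power-of-two tp whose per-rank weight share fits on one GPU.
PARAMS = 397e9                                        # ~397B parameters
GPU_GB = {"H100": 80, "H200": 141, "B200": 183, "B300": 275}
BYTES_PER_PARAM = {"bf16": 2, "fp8": 1}

def min_tp(gpu, precision):
    weights_gb = PARAMS * BYTES_PER_PARAM[precision] / 1e9
    tp = 1
    while weights_gb / tp > GPU_GB[gpu]:
        tp *= 2
    return tp

# BF16: ~794GB of weights; at tp=8 each H100 rank needs ~99GB > 80GB.
print(min_tp("H100", "bf16"))  # 16
# FP8: ~397GB of weights; ~50GB per rank fits an 80GB H100 at tp=8.
print(min_tp("H100", "fp8"))   # 8
```

The same arithmetic reproduces the FP8 entries above: tp=4 on H200 (~99GB/rank under 141GB) and tp=2 on B300 (~199GB/rank under 275GB).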
34 changes: 24 additions & 10 deletions docs/autoregressive/Qwen/Qwen3.5.md
@@ -18,6 +18,8 @@ Qwen3.5 features Gated Delta Networks combined with sparse Mixture-of-Experts
**Available Models:**

- **BF16 (Full precision)**: [Qwen/Qwen3.5-397B-A17B](https://huggingface.co/Qwen/Qwen3.5-397B-A17B)
- **FP8 (8-bit Quantized)**: [Qwen/Qwen3.5-397B-A17B-FP8](https://huggingface.co/Qwen/Qwen3.5-397B-A17B-FP8)
- **FP4 (4-bit Quantized)**: [nvidia/Qwen3.5-397B-A17B-NVFP4](https://huggingface.co/nvidia/Qwen3.5-397B-A17B-NVFP4)

**License:** Apache 2.0

@@ -49,22 +51,34 @@ import Qwen35ConfigGenerator from '@site/src/components/autoregressive/Qwen35Con

### 3.2 Configuration Tips

- Speculative decoding (MTP) can significantly reduce latency for interactive use cases.
- The `--mem-fraction-static` flag is recommended for optimal memory utilization; adjust it based on your hardware and workload.
- Context length defaults to 262,144 tokens. If you encounter OOM errors, consider reducing it, but maintain at least 128K to preserve thinking capabilities.
- To speed up weight loading for this large model, add `--model-loader-extra-config='{"enable_multithread_load": "true","num_threads": 64}'` to the launch command.
- **CUDA IPC Transport**: Add `SGLANG_USE_CUDA_IPC_TRANSPORT=1` as an environment variable to use CUDA IPC for transferring multimodal features, significantly improving TTFT (Time To First Token). Note: this consumes additional memory proportional to image size, so you may need to lower `--mem-fraction-static` or `--max-running-requests`.
- **Multimodal Attention Backend**: Use `--mm-attention-backend fa3` on H100/H200 for better vision performance, or `--mm-attention-backend fa4` on B200/B300.
- For processing large images or videos, you may need to lower `--mem-fraction-static` to leave room for image feature tensors.
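Several of the tips above pair an environment variable with launch flags. A hedged sketch of collecting them in one place (the env var, flag names, and loader config come from the tips; the `0.85` memory fraction is an illustrative value, not a recommendation):

```python
import json

# CUDA IPC transport for multimodal features (improves TTFT, costs extra memory).
env = {"SGLANG_USE_CUDA_IPC_TRANSPORT": "1"}

# Multithreaded weight loading is passed as a JSON string, per the tip above.
loader_cfg = json.dumps({"enable_multithread_load": "true", "num_threads": 64})

flags = [
    "--mem-fraction-static", "0.85",      # illustrative; tune per workload
    "--mm-attention-backend", "fa3",      # fa3 on H100/H200, fa4 on B200/B300
    "--model-loader-extra-config", loader_cfg,
]
print(env, flags)
```

When enabling the IPC transport, remember the tip's caveat: lower `--mem-fraction-static` or `--max-running-requests` if the extra image-feature buffers cause OOM.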

- Hardware requirements:
- **BF16**: ~397B parameters require ~800GB of GPU memory for weights.
- **H100 (80GB)** requires tp=16 (2 nodes) since each rank needs ~100GB at tp=8.
- **H200 (141GB)** runs with tp=8.
- **B200 (183GB)** runs with tp=8.
- **B300 (275GB)** runs with tp=4.
- **FP8**: The FP8 quantized model requires ~400GB for weights, cutting memory in half.
- **H100 (80GB)** runs with tp=8.
- **H200 (141GB)** runs with tp=4.
- **B200 (183GB)** runs with tp=4.
- **B300 (275GB)** runs with tp=2.
- **FP4**: The FP4 quantized model requires ~250GB for weights, reducing memory almost 4x relative to BF16. Only compatible with B200/B300 (Blackwell architecture).
- **B200 (183GB)** runs with tp=4.
- **B300 (275GB)** runs with tp=2.

| Hardware | Memory | BF16 TP | FP8 TP | FP4 TP |
| -------- | ------ | ------- | ------ | --------------- |
| H100 | 80GB | 16 | 8 | N/A |
| H200 | 141GB | 8 | 4 | N/A |
| B200 | 183GB | 8 | 4 | 4 |
| B300 | 275GB | 4 | 2 | 2 |
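Every cell in the table can be sanity-checked against the approximate weight sizes quoted above (~800GB BF16, ~400GB FP8, ~250GB FP4). A small sketch, ignoring activation and KV-cache overhead:

```python
# Sketch: verify each (hardware, precision, tp) cell leaves the per-rank
# weight share under the GPU's memory capacity.
WEIGHTS_GB = {"bf16": 800, "fp8": 400, "fp4": 250}
TABLE = {  # hardware: (memory_gb, {precision: tp})
    "H100": (80,  {"bf16": 16, "fp8": 8}),
    "H200": (141, {"bf16": 8,  "fp8": 4}),
    "B200": (183, {"bf16": 8,  "fp8": 4, "fp4": 4}),
    "B300": (275, {"bf16": 4,  "fp8": 2, "fp4": 2}),
}

def fits(hw, precision):
    mem_gb, tps = TABLE[hw]
    per_rank_gb = WEIGHTS_GB[precision] / tps[precision]
    return per_rank_gb < mem_gb

assert all(fits(hw, p) for hw, (_, tps) in TABLE.items() for p in tps)
```

Note the headroom varies widely (e.g. ~50GB/rank on an 80GB H100 vs ~62.5GB/rank on a 183GB B200), which is why flags like `--mem-fraction-static` still need per-hardware tuning.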

## 4. Model Invocation
