From e586a78127d2b50ac052938633b5c63cf0809abf Mon Sep 17 00:00:00 2001 From: BBuf <1182563586@qq.com> Date: Sun, 19 Apr 2026 17:19:36 +0800 Subject: [PATCH 1/5] Add Qwen Image ModelOpt FP8 diffusion support --- docs/diffusion/quantization.md | 42 ++++- .../sglang-diffusion-modelopt-quant/SKILL.md | 31 +++- .../runtime/models/dits/qwen_image.py | 144 +++++++++++++++--- .../tools/build_modelopt_fp8_transformer.py | 26 +++- 4 files changed, 210 insertions(+), 33 deletions(-) diff --git a/docs/diffusion/quantization.md b/docs/diffusion/quantization.md index 970b1ee5d743..a8850ca1dda4 100644 --- a/docs/diffusion/quantization.md +++ b/docs/diffusion/quantization.md @@ -43,21 +43,21 @@ backend. | quant_family | checkpoint form | canonical CLI | supported models | extra dependency | platform / notes | |-------------------|--------------------------------------------------------------------------------------------|------------------------------------------------------------------------|-----------------------------------------|---------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------| | `fp8` | Quantized transformer component folder, or safetensors with `quantization_config` metadata | `--transformer-path` or `--transformer-weights-path` | ALL | None | Component-folder and single-file flows are both supported | -| `modelopt-fp8` | Converted ModelOpt FP8 transformer directory or repo with `config.json` | `--transformer-path` | FLUX.1, FLUX.2, Wan2.2 | None | Serialized config stays `quant_method=modelopt` with `quant_algo=FP8`; `dit_layerwise_offload` is supported and `dit_cpu_offload` stays disabled | +| `modelopt-fp8` | Converted ModelOpt FP8 transformer directory or repo with `config.json` | `--transformer-path` | FLUX.1, FLUX.2, Wan2.2, Qwen Image, Qwen Image Edit | None | Serialized config stays `quant_method=modelopt` with `quant_algo=FP8`; 
`dit_layerwise_offload` is supported and `dit_cpu_offload` stays disabled | | `modelopt-nvfp4` | Mixed transformer directory/repo with `config.json`, or raw NVFP4 safetensors export/repo | `--transformer-path` for mixed overrides; `--transformer-weights-path` for raw exports | FLUX.1, FLUX.2, Wan2.2 | None | Mixed override repos keep the base model separate; raw exports such as `black-forest-labs/FLUX.2-dev-NVFP4` still use the weights-path flow | | `nunchaku-svdq` | Pre-quantized Nunchaku transformer weights, usually named `svdq-{int4\|fp4}_r{rank}-...` | `--transformer-weights-path` | Model-specific support such as Qwen-Image, FLUX, and Z-Image | `nunchaku` | SGLang can infer precision and rank from the filename and supports both `int4` and `nvfp4` | | `msmodelslim` | Pre-quantized msmodelslim transformer weights | `--model-path` | Wan2.2 family | None | Currently only compatible with the Ascend NPU family and supports both `w8a8` and `w4a4` | ## Validated ModelOpt Checkpoints -This section is the canonical support matrix for the six diffusion ModelOpt -checkpoints currently wired up in SGLang docs and B200 CI coverage. +This section is the canonical support matrix for the diffusion ModelOpt +checkpoints currently wired up in SGLang docs and validation coverage. Published checkpoints keep the serialized quantization config as `quant_method=modelopt`; the FP8 vs NVFP4 split below is a documentation label derived from `quant_algo`. -Five of the six repos live under `BBuf/*`. The FLUX.2 NVFP4 entry keeps the +All but one of the repos live under `BBuf/*`. The FLUX.2 NVFP4 entry keeps the official `black-forest-labs/FLUX.2-dev-NVFP4` repo. | Quant Algo | Base Model | Preferred CLI | HF Repo | Current Scope | Notes | @@ -65,12 +65,15 @@ official `black-forest-labs/FLUX.2-dev-NVFP4` repo. 
| `FP8` | `black-forest-labs/FLUX.1-dev` | `--transformer-path` | `BBuf/flux1-dev-modelopt-fp8-sglang-transformer` | single-transformer override, deterministic latent/image comparison, H100 benchmark, torch-profiler trace | SGLang converter keeps a validated BF16 fallback set for modulation and FF projection layers; use `--model-id FLUX.1-dev` for local mirrors | | `FP8` | `black-forest-labs/FLUX.2-dev` | `--transformer-path` | `BBuf/flux2-dev-modelopt-fp8-sglang-transformer` | single-transformer override load and generation path | published SGLang-ready transformer override | | `FP8` | `Wan-AI/Wan2.2-T2V-A14B-Diffusers` | `--transformer-path` | `BBuf/wan22-t2v-a14b-modelopt-fp8-sglang-transformer` | primary `transformer` quantized, `transformer_2` kept BF16 | primary-transformer-only path; keep `transformer_2` on the base checkpoint, and do not describe this as dual-transformer full-model FP8 unless that path is validated separately | +| `FP8` | `Qwen/Qwen-Image` | `--transformer-path` | `BBuf/Qwen-Image-ModelOpt-FP8-SGLang` | single-transformer override, BF16-vs-FP8 image comparison, H100 benchmark, torch-profiler trace | shares the Qwen Image FP8 fallback preset; keep `img_in`, `txt_in`, timestep embedder, `norm_out.linear`, `proj_out`, `img_mod`/`txt_mod`, and `img_mlp.net.2` in BF16 | +| `FP8` | `Qwen/Qwen-Image-Edit-2511` | `--transformer-path` | `BBuf/Qwen-Image-Edit-ModelOpt-FP8-SGLang` | TI2I edit smoke, BF16-vs-FP8 image comparison, H100 benchmark | shares `QwenImageTransformer2DModel` with Qwen Image and uses the same Qwen Image FP8 fallback preset | | `NVFP4` | `black-forest-labs/FLUX.1-dev` | `--transformer-path` | `BBuf/flux1-dev-modelopt-nvfp4-sglang-transformer` | mixed BF16+NVFP4 transformer override, correctness validation, 4x RTX 5090 benchmark, torch-profiler trace | use `build_modelopt_nvfp4_transformer.py`; validated builder keeps selected FLUX.1 modules in BF16 and sets `swap_weight_nibbles=false` | | `NVFP4` | `black-forest-labs/FLUX.2-dev` 
| `--transformer-weights-path` | `black-forest-labs/FLUX.2-dev-NVFP4` | packed-QKV load path | official raw export repo; validated packed export detection and runtime layout handling | | `NVFP4` | `Wan-AI/Wan2.2-T2V-A14B-Diffusers` | `--transformer-path` | `BBuf/wan22-t2v-a14b-modelopt-nvfp4-sglang-transformer` | primary `transformer` quantized with ModelOpt NVFP4, `transformer_2` kept BF16 | primary-transformer-only path; keep `transformer_2` on the base checkpoint, and current B200/Blackwell bring-up uses `SGLANG_DIFFUSION_FLASHINFER_FP4_GEMM_BACKEND=cudnn` | -These six checkpoints are also the intended case set for the B200 diffusion CI -job (`multimodal-gen-test-1-b200`). +The FLUX and Wan entries are also the intended case set for the B200 diffusion +CI job (`multimodal-gen-test-1-b200`). The Qwen Image FP8 entries are currently +H100 manual-validation artifacts. ## ModelOpt FP8 @@ -96,6 +99,23 @@ sglang generate \ --save-output ``` +```bash +sglang generate \ + --model-path Qwen/Qwen-Image \ + --transformer-path BBuf/Qwen-Image-ModelOpt-FP8-SGLang \ + --prompt "A tiny astronaut reading a book under a glass greenhouse" \ + --save-output +``` + +```bash +sglang generate \ + --model-path Qwen/Qwen-Image-Edit-2511 \ + --transformer-path BBuf/Qwen-Image-Edit-ModelOpt-FP8-SGLang \ + --image-path /path/to/input.png \ + --prompt "Turn the scene into a warm watercolor illustration" \ + --save-output +``` + ### Notes - `--transformer-path` is the canonical flag for converted ModelOpt FP8 @@ -112,6 +132,16 @@ sglang generate \ - On disk, the quantization config stays `quant_method=modelopt` with `quant_algo=FP8`; the `modelopt-fp8` label in this document is a support family name, not a serialized config key. +- `Qwen/Qwen-Image` and `Qwen/Qwen-Image-Edit-2511` share the `qwen-image` + converter preset. Use `--model-type qwen-image` to force it, or rely on + auto-detection from `_class_name=QwenImageTransformer2DModel`. 
+- The validated Qwen Image FP8 fallback preset keeps `img_in`, `txt_in`, + timestep embedder linear layers, `norm_out.linear`, `proj_out`, + `transformer_blocks.*.(img_mod|txt_mod)`, and + `transformer_blocks.*.img_mlp.net.2` in BF16. +- For Qwen Image FP8 conversion, write explicit BF16 fallback tensors before + honoring ModelOpt ignored weights. Otherwise converter stats can report a + fallback while the output checkpoint still retains the source FP8 tensor. - To build the converted checkpoint yourself from a ModelOpt diffusers export, use `python -m sglang.multimodal_gen.tools.build_modelopt_fp8_transformer`. diff --git a/python/sglang/multimodal_gen/.claude/skills/sglang-diffusion-modelopt-quant/SKILL.md b/python/sglang/multimodal_gen/.claude/skills/sglang-diffusion-modelopt-quant/SKILL.md index 227bea36942c..0c60baf1ad16 100644 --- a/python/sglang/multimodal_gen/.claude/skills/sglang-diffusion-modelopt-quant/SKILL.md +++ b/python/sglang/multimodal_gen/.claude/skills/sglang-diffusion-modelopt-quant/SKILL.md @@ -61,9 +61,9 @@ This repo now contains: - trajectory similarity validation: [`python/sglang/multimodal_gen/tools/compare_diffusion_trajectory_similarity.py`](../../../tools/compare_diffusion_trajectory_similarity.py) -Validated documentation and CI coverage currently center on six ModelOpt diffusion transformer override families: +Validated documentation and CI coverage currently center on these ModelOpt diffusion transformer override families: -- FP8: FLUX.1-dev, FLUX.2-dev, Wan2.2 +- FP8: FLUX.1-dev, FLUX.2-dev, Wan2.2, Qwen Image, Qwen Image Edit - NVFP4: FLUX.1-dev, FLUX.2-dev, Wan2.2 Treat a new family, a new precision, or a new checkpoint layout as unsupported until it has a documented matrix row and a matching validation story. 
@@ -173,6 +173,33 @@ For `FLUX.1-dev`, the validated fallback set currently keeps these modules in BF Use `--model-type flux1` to force that profile, or rely on `--model-type auto` when the export config identifies `FluxTransformer2DModel`. +Qwen Image and Qwen Image Edit share `QwenImageTransformer2DModel`, so one +ModelOpt FP8 fallback preset covers both. The validated Qwen Image fallback set +keeps these modules in BF16: + +- `img_in` +- `txt_in` +- `time_text_embed.timestep_embedder.linear_1` +- `time_text_embed.timestep_embedder.linear_2` +- `norm_out.linear` +- `proj_out` +- `transformer_blocks.*.img_mlp.net.2` +- `transformer_blocks.*.img_mod` +- `transformer_blocks.*.txt_mod` + +Use `--model-type qwen-image` to force that profile, or rely on +`--model-type auto` when the export config identifies +`QwenImageTransformer2DModel`. + +Qwen modulation weights can appear in safetensors as `.img_mod.1.weight` and +`.txt_mod.1.weight`. Canonicalize those module names to `.img_mod` and +`.txt_mod` before fallback matching. + +For Qwen Image FP8, explicit BF16 fallback tensors must be written before +honoring ModelOpt ignored weights. Otherwise converter stats can report a +fallback while the output checkpoint still retains the source FP8 tensor, which +causes severe image-quality regressions. 
+ For FLUX.1-dev NVFP4 model families that need a mixed BF16+NVFP4 checkpoint, build the merged transformer explicitly: ```bash diff --git a/python/sglang/multimodal_gen/runtime/models/dits/qwen_image.py b/python/sglang/multimodal_gen/runtime/models/dits/qwen_image.py index 1b3bc8a4ae4d..bfb5a19cdb63 100644 --- a/python/sglang/multimodal_gen/runtime/models/dits/qwen_image.py +++ b/python/sglang/multimodal_gen/runtime/models/dits/qwen_image.py @@ -10,7 +10,6 @@ import torch import torch.nn as nn import torch.nn.functional as F -from diffusers.models.attention import FeedForward from diffusers.models.embeddings import TimestepEmbedding, Timesteps from diffusers.models.modeling_outputs import Transformer2DModelOutput from diffusers.models.normalization import AdaLayerNormContinuous @@ -532,10 +531,27 @@ def __init__( prefix=f"{prefix}.to_qkv", ) else: - # Use separate Q/K/V projections for non-quantized models - self.to_q = ReplicatedLinear(dim, self.inner_dim, bias=True) - self.to_k = ReplicatedLinear(dim, self.inner_dim, bias=True) - self.to_v = ReplicatedLinear(dim, self.inner_dim, bias=True) + self.to_q = ReplicatedLinear( + dim, + self.inner_dim, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.to_q", + ) + self.to_k = ReplicatedLinear( + dim, + self.inner_dim, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.to_k", + ) + self.to_v = ReplicatedLinear( + dim, + self.inner_dim, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.to_v", + ) if self.qk_norm: self.norm_q = RMSNorm(head_dim, eps=eps) if qk_norm else nn.Identity() @@ -552,15 +568,26 @@ def __init__( prefix=f"{prefix}.to_added_qkv", ) else: - # Use separate Q/K/V projections for non-quantized models self.add_q_proj = ReplicatedLinear( - added_kv_proj_dim, self.inner_dim, bias=True + added_kv_proj_dim, + self.inner_dim, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.add_q_proj", ) self.add_k_proj = ReplicatedLinear( - added_kv_proj_dim, self.inner_dim, 
bias=True + added_kv_proj_dim, + self.inner_dim, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.add_k_proj", ) self.add_v_proj = ReplicatedLinear( - added_kv_proj_dim, self.inner_dim, bias=True + added_kv_proj_dim, + self.inner_dim, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.add_v_proj", ) if context_pre_only is not None and not context_pre_only: @@ -701,6 +728,65 @@ def forward( return img_attn_output, txt_attn_output +class QwenImageGELU(nn.Module): + def __init__( + self, + dim: int, + inner_dim: int, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + self.proj = ReplicatedLinear( + dim, + inner_dim, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.proj", + ) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states, _ = self.proj(hidden_states) + return F.gelu(hidden_states, approximate="tanh") + + +class QwenImageFeedForward(nn.Module): + def __init__( + self, + dim: int, + dim_out: int, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + mult: int = 4, + ) -> None: + super().__init__() + inner_dim = dim * mult + self.net = nn.ModuleList( + [ + QwenImageGELU( + dim, + inner_dim, + quant_config=quant_config, + prefix=f"{prefix}.net.0", + ), + nn.Dropout(0.0), + ReplicatedLinear( + inner_dim, + dim_out, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.net.2", + ), + ] + ) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.net[0](hidden_states) + hidden_states = self.net[1](hidden_states) + hidden_states, _ = self.net[2](hidden_states) + return hidden_states + + class QwenImageTransformerBlock(nn.Module): def __init__( self, @@ -790,15 +876,17 @@ def __init__( activation_fn="gelu-approximate", ) else: - self.img_mlp = FeedForward( + self.img_mlp = QwenImageFeedForward( dim=dim, dim_out=dim, - activation_fn="gelu-approximate", + quant_config=quant_config, + 
prefix=f"{prefix}.img_mlp", ) - self.txt_mlp = FeedForward( + self.txt_mlp = QwenImageFeedForward( dim=dim, dim_out=dim, - activation_fn="gelu-approximate", + quant_config=quant_config, + prefix=f"{prefix}.txt_mlp", ) if nunchaku_enabled: @@ -1120,8 +1208,20 @@ def __init__( self.txt_norm = RMSNorm(joint_attention_dim, eps=1e-6) - self.img_in = nn.Linear(in_channels, self.inner_dim) - self.txt_in = nn.Linear(joint_attention_dim, self.inner_dim) + self.img_in = ReplicatedLinear( + in_channels, + self.inner_dim, + bias=True, + quant_config=quant_config, + prefix="img_in", + ) + self.txt_in = ReplicatedLinear( + joint_attention_dim, + self.inner_dim, + bias=True, + quant_config=quant_config, + prefix="txt_in", + ) self.transformer_blocks = nn.ModuleList( [ @@ -1140,8 +1240,12 @@ def __init__( self.norm_out = AdaLayerNormContinuous( self.inner_dim, self.inner_dim, elementwise_affine=False, eps=1e-6 ) - self.proj_out = nn.Linear( - self.inner_dim, patch_size * patch_size * self.out_channels, bias=True + self.proj_out = ReplicatedLinear( + self.inner_dim, + patch_size * patch_size * self.out_channels, + bias=True, + quant_config=quant_config, + prefix="proj_out", ) self.timestep_zero = torch.zeros( @@ -1224,7 +1328,7 @@ def forward( if isinstance(encoder_hidden_states, list): encoder_hidden_states = encoder_hidden_states[0] - hidden_states = self.img_in(hidden_states) + hidden_states, _ = self.img_in(hidden_states) timestep = (timestep / 1000).to(hidden_states.dtype) @@ -1236,7 +1340,7 @@ def forward( modulate_index = None encoder_hidden_states = self.txt_norm(encoder_hidden_states) - encoder_hidden_states = self.txt_in(encoder_hidden_states) + encoder_hidden_states, _ = self.txt_in(encoder_hidden_states) temb = self.time_text_embed(timestep, hidden_states, additional_t_cond) @@ -1274,7 +1378,7 @@ def forward( # Use only the image part (hidden_states) from the dual-stream blocks hidden_states = self.norm_out(hidden_states, temb_txt) - output = 
self.proj_out(hidden_states) + output, _ = self.proj_out(hidden_states) return output diff --git a/python/sglang/multimodal_gen/tools/build_modelopt_fp8_transformer.py b/python/sglang/multimodal_gen/tools/build_modelopt_fp8_transformer.py index 1f29761bb4a1..923ee5f3275b 100644 --- a/python/sglang/multimodal_gen/tools/build_modelopt_fp8_transformer.py +++ b/python/sglang/multimodal_gen/tools/build_modelopt_fp8_transformer.py @@ -76,6 +76,15 @@ r"^transformer_blocks\.(0|43|44|45|46|47)\.(attn1|attn2|audio_attn1|audio_attn2|audio_to_video_attn|video_to_audio_attn)\.to_out\.0$", r"^transformer_blocks\.(0|43|44|45|46|47)\.(ff|audio_ff)\.proj_(in|out)$", ] +DEFAULT_QWEN_IMAGE_KEEP_BF16_PATTERNS = [ + r"^img_in$", + r"^txt_in$", + r"^time_text_embed\.timestep_embedder\.linear_[12]$", + r"^norm_out\.linear$", + r"^proj_out$", + r"^transformer_blocks\.\d+\.img_mlp\.net\.2$", + r"^transformer_blocks\.\d+\.(img_mod|txt_mod)$", +] def _resolve_transformer_dir(path: str) -> str: @@ -173,6 +182,7 @@ def _module_name_variants(weight_name: str) -> list[str]: canonicalized.append( re.sub(r"(\.audio_ff|\.ff)\.net\.2$", r"\1.proj_out", variant) ) + canonicalized.append(re.sub(r"(\.(img_mod|txt_mod))\.1$", r"\1", variant)) variants.extend(canonicalized) deduped: list[str] = [] @@ -259,12 +269,16 @@ def get_default_keep_bf16_patterns( return list(DEFAULT_FLUX1_KEEP_BF16_PATTERNS) if model_type == "flux2": return list(DEFAULT_FLUX2_KEEP_BF16_PATTERNS) + if model_type == "qwen-image": + return list(DEFAULT_QWEN_IMAGE_KEEP_BF16_PATTERNS) if model_type == "none": return [] if class_name == "FluxTransformer2DModel": return list(DEFAULT_FLUX1_KEEP_BF16_PATTERNS) if class_name == "Flux2Transformer2DModel": return list(DEFAULT_FLUX2_KEEP_BF16_PATTERNS) + if class_name == "QwenImageTransformer2DModel": + return list(DEFAULT_QWEN_IMAGE_KEEP_BF16_PATTERNS) return [] @@ -552,13 +566,14 @@ def build_modelopt_fp8_transformer( if name in fallback_scale_names: del shard_tensors[name] continue + if 
name in fallback_tensors: + shard_tensors[name] = fallback_tensors[name] + continue if name.endswith(".weight") and is_ignored_by_modelopt( name, ignore_patterns ): preserved_ignored_weight_count += 1 continue - if name in fallback_tensors: - shard_tensors[name] = fallback_tensors[name] scale_key = _resolve_scale_key(name, fp8_scale_map) if ( name.endswith(".weight") @@ -645,12 +660,13 @@ def _parse_args() -> argparse.Namespace: ) parser.add_argument( "--model-type", - choices=["auto", "flux1", "flux2", "ltx2", "none"], + choices=["auto", "flux1", "flux2", "ltx2", "qwen-image", "none"], default="auto", help=( "Optional model-family BF16 fallback profile. 'none' uses the generic " - "conversion path. 'auto' enables the validated FLUX.1 / FLUX.2 / LTX-2 " - "fallback set when the export config matches those transformer classes." + "conversion path. 'auto' enables the validated FLUX.1 / FLUX.2 / LTX-2 / " + "Qwen Image fallback set when the export config matches those transformer " + "classes." ), ) parser.add_argument( From 8f424f4a0c4e8fdc8151c2424d0f968f40982f90 Mon Sep 17 00:00:00 2001 From: BBuf <1182563586@qq.com> Date: Mon, 20 Apr 2026 10:55:33 +0800 Subject: [PATCH 2/5] Add HunyuanVideo ModelOpt FP8 diffusion support --- docs/diffusion/quantization.md | 23 +- .../benchmarks/bench_offline_throughput.py | 4 +- .../multimodal_gen/runtime/layers/linear.py | 8 +- .../runtime/models/dits/hunyuanvideo.py | 25 ++- .../tools/build_modelopt_fp8_transformer.py | 199 ++++++++++++++++-- 5 files changed, 227 insertions(+), 32 deletions(-) diff --git a/docs/diffusion/quantization.md b/docs/diffusion/quantization.md index a8850ca1dda4..3f1ca9e1e6ce 100644 --- a/docs/diffusion/quantization.md +++ b/docs/diffusion/quantization.md @@ -43,7 +43,7 @@ backend. 
| quant_family | checkpoint form | canonical CLI | supported models | extra dependency | platform / notes | |-------------------|--------------------------------------------------------------------------------------------|------------------------------------------------------------------------|-----------------------------------------|---------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------| | `fp8` | Quantized transformer component folder, or safetensors with `quantization_config` metadata | `--transformer-path` or `--transformer-weights-path` | ALL | None | Component-folder and single-file flows are both supported | -| `modelopt-fp8` | Converted ModelOpt FP8 transformer directory or repo with `config.json` | `--transformer-path` | FLUX.1, FLUX.2, Wan2.2, Qwen Image, Qwen Image Edit | None | Serialized config stays `quant_method=modelopt` with `quant_algo=FP8`; `dit_layerwise_offload` is supported and `dit_cpu_offload` stays disabled | +| `modelopt-fp8` | Converted ModelOpt FP8 transformer directory or repo with `config.json` | `--transformer-path` | FLUX.1, FLUX.2, Wan2.2, Qwen Image, Qwen Image Edit, HunyuanVideo | None | Serialized config stays `quant_method=modelopt` with `quant_algo=FP8`; `dit_layerwise_offload` is supported and `dit_cpu_offload` stays disabled | | `modelopt-nvfp4` | Mixed transformer directory/repo with `config.json`, or raw NVFP4 safetensors export/repo | `--transformer-path` for mixed overrides; `--transformer-weights-path` for raw exports | FLUX.1, FLUX.2, Wan2.2 | None | Mixed override repos keep the base model separate; raw exports such as `black-forest-labs/FLUX.2-dev-NVFP4` still use the weights-path flow | | `nunchaku-svdq` | Pre-quantized Nunchaku transformer weights, usually named `svdq-{int4\|fp4}_r{rank}-...` | `--transformer-weights-path` | Model-specific support such as Qwen-Image, FLUX, and Z-Image | 
`nunchaku` | SGLang can infer precision and rank from the filename and supports both `int4` and `nvfp4` | | `msmodelslim` | Pre-quantized msmodelslim transformer weights | `--model-path` | Wan2.2 family | None | Currently only compatible with the Ascend NPU family and supports both `w8a8` and `w4a4` | @@ -67,6 +67,7 @@ official `black-forest-labs/FLUX.2-dev-NVFP4` repo. | `FP8` | `Wan-AI/Wan2.2-T2V-A14B-Diffusers` | `--transformer-path` | `BBuf/wan22-t2v-a14b-modelopt-fp8-sglang-transformer` | primary `transformer` quantized, `transformer_2` kept BF16 | primary-transformer-only path; keep `transformer_2` on the base checkpoint, and do not describe this as dual-transformer full-model FP8 unless that path is validated separately | | `FP8` | `Qwen/Qwen-Image` | `--transformer-path` | `BBuf/Qwen-Image-ModelOpt-FP8-SGLang` | single-transformer override, BF16-vs-FP8 image comparison, H100 benchmark, torch-profiler trace | shares the Qwen Image FP8 fallback preset; keep `img_in`, `txt_in`, timestep embedder, `norm_out.linear`, `proj_out`, `img_mod`/`txt_mod`, and `img_mlp.net.2` in BF16 | | `FP8` | `Qwen/Qwen-Image-Edit-2511` | `--transformer-path` | `BBuf/Qwen-Image-Edit-ModelOpt-FP8-SGLang` | TI2I edit smoke, BF16-vs-FP8 image comparison, H100 benchmark | shares `QwenImageTransformer2DModel` with Qwen Image and uses the same Qwen Image FP8 fallback preset | +| `FP8` | `hunyuanvideo-community/HunyuanVideo` | `--transformer-path` | `BBuf/HunyuanVideo-ModelOpt-FP8-SGLang` | single-transformer override, BF16-vs-FP8 video comparison, H100 benchmark, torch-profiler trace | HunyuanVideo uses different ModelOpt/diffusers and SGLang runtime module names; the converter maps those names before writing FP8 scale tensors and BF16 fallback ignores | | `NVFP4` | `black-forest-labs/FLUX.1-dev` | `--transformer-path` | `BBuf/flux1-dev-modelopt-nvfp4-sglang-transformer` | mixed BF16+NVFP4 transformer override, correctness validation, 4x RTX 5090 benchmark, torch-profiler trace | use 
`build_modelopt_nvfp4_transformer.py`; validated builder keeps selected FLUX.1 modules in BF16 and sets `swap_weight_nibbles=false` | | `NVFP4` | `black-forest-labs/FLUX.2-dev` | `--transformer-weights-path` | `black-forest-labs/FLUX.2-dev-NVFP4` | packed-QKV load path | official raw export repo; validated packed export detection and runtime layout handling | | `NVFP4` | `Wan-AI/Wan2.2-T2V-A14B-Diffusers` | `--transformer-path` | `BBuf/wan22-t2v-a14b-modelopt-nvfp4-sglang-transformer` | primary `transformer` quantized with ModelOpt NVFP4, `transformer_2` kept BF16 | primary-transformer-only path; keep `transformer_2` on the base checkpoint, and current B200/Blackwell bring-up uses `SGLANG_DIFFUSION_FLASHINFER_FP4_GEMM_BACKEND=cudnn` | @@ -116,6 +117,15 @@ sglang generate \ --save-output ``` +```bash +sglang generate \ + --model-path hunyuanvideo-community/HunyuanVideo \ + --transformer-path BBuf/HunyuanVideo-ModelOpt-FP8-SGLang \ + --height 544 --width 960 --num-frames 17 \ + --prompt "A cinematic shot of a red sports car driving through rain at night" \ + --save-output +``` + ### Notes - `--transformer-path` is the canonical flag for converted ModelOpt FP8 @@ -142,6 +152,17 @@ sglang generate \ - For Qwen Image FP8 conversion, write explicit BF16 fallback tensors before honoring ModelOpt ignored weights. Otherwise converter stats can report a fallback while the output checkpoint still retains the source FP8 tensor. +- `hunyuanvideo-community/HunyuanVideo` uses the `hunyuan-video` converter + preset. Use `--model-type hunyuan-video` to force it, or rely on + auto-detection from `_class_name=HunyuanVideoTransformer3DModel`. +- The validated HunyuanVideo FP8 fallback preset keeps `context_embedder`, + `x_embedder.proj`, timestep/guidance/text embedder linear layers, + `norm_out.linear`, `proj_out`, double-block modulation linear layers, and + single-block modulation linear layers in BF16. 
+- HunyuanVideo ModelOpt exports use diffusers module names that do not match + SGLang runtime module names for fused QKV and fused QKV+MLP layers. The + converter maps the names before selecting scale tensors and before writing + the runtime ignore list. - To build the converted checkpoint yourself from a ModelOpt diffusers export, use `python -m sglang.multimodal_gen.tools.build_modelopt_fp8_transformer`. diff --git a/python/sglang/multimodal_gen/benchmarks/bench_offline_throughput.py b/python/sglang/multimodal_gen/benchmarks/bench_offline_throughput.py index 5f0a5996c051..10a50b598ec2 100644 --- a/python/sglang/multimodal_gen/benchmarks/bench_offline_throughput.py +++ b/python/sglang/multimodal_gen/benchmarks/bench_offline_throughput.py @@ -427,9 +427,9 @@ def main(): ServerArgs.add_cli_args(parser) BenchArgs.add_cli_args(parser) - args = parser.parse_args() + args, unknown_args = parser.parse_known_args() - server_args = ServerArgs.from_cli_args(args) + server_args = ServerArgs.from_cli_args(args, unknown_args) bench_args = BenchArgs.from_cli_args(args) set_global_server_args(server_args) diff --git a/python/sglang/multimodal_gen/runtime/layers/linear.py b/python/sglang/multimodal_gen/runtime/layers/linear.py index 75f57ff48755..a428f95db8d0 100644 --- a/python/sglang/multimodal_gen/runtime/layers/linear.py +++ b/python/sglang/multimodal_gen/runtime/layers/linear.py @@ -231,6 +231,7 @@ def __init__( skip_bias_add: bool = False, params_dtype: torch.dtype | None = None, quant_config: QuantizationConfig | None = None, + output_sizes: list[int] | None = None, prefix: str = "", ): super().__init__( @@ -244,10 +245,11 @@ def __init__( # All the linear layer supports quant method. 
assert self.quant_method is not None + output_partition_sizes = output_sizes or [self.output_size] self.quant_method.create_weights( self, self.input_size, - [self.output_size], + output_partition_sizes, self.input_size, self.output_size, self.params_dtype, @@ -496,7 +498,6 @@ def weight_loader( loaded_weight: torch.Tensor, loaded_shard_id: int | None = None, ) -> None: - param_data = param.data output_dim = getattr(param, "output_dim", None) # Special case for AQLM codebooks. @@ -828,7 +829,6 @@ def weight_loader( loaded_weight: torch.Tensor, loaded_shard_id: str | None = None, ): - param_data = param.data output_dim = getattr(param, "output_dim", None) # Special case for AQLM codebooks. @@ -865,7 +865,6 @@ def weight_loader( ] for shard_id, shard_offset, shard_size in shard_offsets: - loaded_weight_shard = loaded_weight.narrow( output_dim, shard_offset, shard_size ) @@ -1036,7 +1035,6 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): param_data.copy_(loaded_weight) def weight_loader_v2(self, param: BasevLLMParameter, loaded_weight: torch.Tensor): - # Special case for loading scales off disk, which often do not # have a shape (such as in the case of AutoFP8). 
if len(loaded_weight.shape) == 0: diff --git a/python/sglang/multimodal_gen/runtime/models/dits/hunyuanvideo.py b/python/sglang/multimodal_gen/runtime/models/dits/hunyuanvideo.py index 09a233ec9176..8f0bdabc54d1 100644 --- a/python/sglang/multimodal_gen/runtime/models/dits/hunyuanvideo.py +++ b/python/sglang/multimodal_gen/runtime/models/dits/hunyuanvideo.py @@ -95,6 +95,7 @@ def __init__( params_dtype=dtype, prefix=f"{prefix}.img_attn_qkv", quant_config=quant_config, + output_sizes=[hidden_size] * 3, ) self.img_attn_q_norm = RMSNorm(head_dim, eps=1e-6, dtype=dtype) @@ -142,7 +143,9 @@ def __init__( hidden_size * 3, bias=True, params_dtype=dtype, + prefix=f"{prefix}.txt_attn_qkv", quant_config=quant_config, + output_sizes=[hidden_size] * 3, ) # QK norm layers for text @@ -154,6 +157,7 @@ def __init__( hidden_size, bias=True, params_dtype=dtype, + prefix=f"{prefix}.txt_attn_proj", quant_config=quant_config, ) @@ -162,6 +166,7 @@ def __init__( mlp_hidden_dim, bias=True, dtype=dtype, + prefix=f"{prefix}.txt_mlp", quant_config=quant_config, ) @@ -220,9 +225,10 @@ def forward( img_k = self.img_attn_k_norm(img_k.contiguous()).to(img_v) # Apply rotary embeddings cos, sin = freqs_cis - img_q, img_k = _apply_rotary_emb( - img_q, cos, sin, is_neox_style=False - ), _apply_rotary_emb(img_k, cos, sin, is_neox_style=False) + img_q, img_k = ( + _apply_rotary_emb(img_q, cos, sin, is_neox_style=False), + _apply_rotary_emb(img_k, cos, sin, is_neox_style=False), + ) # Prepare text for attention using fused operation txt_attn_input = self.txt_attn_norm(txt, txt_attn_shift, txt_attn_scale) @@ -304,6 +310,7 @@ def __init__( params_dtype=dtype, prefix=f"{prefix}.linear1", quant_config=quant_config, + output_sizes=[hidden_size] * 3 + [mlp_hidden_dim], ) # Combined projection and MLP output @@ -386,9 +393,10 @@ def forward( img_v, txt_v = v[:, :-txt_len], v[:, -txt_len:] # Apply rotary embeddings to image parts cos, sin = freqs_cis - img_q, img_k = _apply_rotary_emb( - img_q, cos, sin, 
is_neox_style=False - ), _apply_rotary_emb(img_k, cos, sin, is_neox_style=False) + img_q, img_k = ( + _apply_rotary_emb(img_q, cos, sin, is_neox_style=False), + _apply_rotary_emb(img_k, cos, sin, is_neox_style=False), + ) # Run distributed attention img_attn_output, txt_attn_output = self.attn( @@ -682,7 +690,6 @@ def maybe_cache_states( self.previous_residual = hidden_states - original_hidden_states def should_skip_forward_for_cached_states(self, **kwargs) -> bool: - forward_context = get_forward_context() forward_batch = forward_context.forward_batch if forward_batch is None: @@ -742,9 +749,7 @@ def should_skip_forward_for_cached_states(self, **kwargs) -> bool: img_mod2_shift, img_mod2_scale, img_mod2_gate, - ) = ( - self.double_blocks[0].img_mod(vec_).chunk(6, dim=-1) - ) + ) = self.double_blocks[0].img_mod(vec_).chunk(6, dim=-1) normed_inp = self.double_blocks[0].img_attn_norm.norm(inp) modulated_inp = modulate(normed_inp, shift=img_mod1_shift, scale=img_mod1_scale) if self.cnt == 0 or self.cnt == num_inference_steps - 1: diff --git a/python/sglang/multimodal_gen/tools/build_modelopt_fp8_transformer.py b/python/sglang/multimodal_gen/tools/build_modelopt_fp8_transformer.py index 923ee5f3275b..f2d965efd967 100644 --- a/python/sglang/multimodal_gen/tools/build_modelopt_fp8_transformer.py +++ b/python/sglang/multimodal_gen/tools/build_modelopt_fp8_transformer.py @@ -29,7 +29,7 @@ import shutil from collections import defaultdict from pathlib import Path -from typing import Iterable, Mapping, Sequence +from typing import Callable, Iterable, Mapping, Sequence import torch from safetensors import safe_open @@ -85,6 +85,128 @@ r"^transformer_blocks\.\d+\.img_mlp\.net\.2$", r"^transformer_blocks\.\d+\.(img_mod|txt_mod)$", ] +DEFAULT_HUNYUANVIDEO_KEEP_BF16_PATTERNS = [ + r"^context_embedder\.", + r"^x_embedder\.proj$", + r"^time_text_embed\.(timestep_embedder|guidance_embedder|text_embedder)\.linear_[12]$", + r"^norm_out\.linear$", + r"^proj_out$", + 
r"^transformer_blocks\.\d+\.norm1\.linear$", + r"^transformer_blocks\.\d+\.norm1_context\.linear$", + r"^single_transformer_blocks\.\d+\.norm\.linear$", +] +HUNYUANVIDEO_RUNTIME_NAME_REPLACEMENTS = [ + ( + r"^context_embedder\.time_text_embed\.timestep_embedder\.linear_1$", + r"txt_in.t_embedder.mlp.fc_in", + ), + ( + r"^context_embedder\.time_text_embed\.timestep_embedder\.linear_2$", + r"txt_in.t_embedder.mlp.fc_out", + ), + (r"^context_embedder\.proj_in$", r"txt_in.input_embedder"), + ( + r"^context_embedder\.time_text_embed\.text_embedder\.linear_1$", + r"txt_in.c_embedder.fc_in", + ), + ( + r"^context_embedder\.time_text_embed\.text_embedder\.linear_2$", + r"txt_in.c_embedder.fc_out", + ), + ( + r"^context_embedder\.token_refiner\.refiner_blocks\.(\d+)\.norm1$", + r"txt_in.refiner_blocks.\1.norm1", + ), + ( + r"^context_embedder\.token_refiner\.refiner_blocks\.(\d+)\.norm2$", + r"txt_in.refiner_blocks.\1.norm2", + ), + ( + r"^context_embedder\.token_refiner\.refiner_blocks\.(\d+)\.attn\.to_[qkv]$", + r"txt_in.refiner_blocks.\1.self_attn_qkv", + ), + ( + r"^context_embedder\.token_refiner\.refiner_blocks\.(\d+)\.attn\.to_out\.0$", + r"txt_in.refiner_blocks.\1.self_attn_proj", + ), + ( + r"^context_embedder\.token_refiner\.refiner_blocks\.(\d+)\.ff\.net\.0(?:\.proj)?$", + r"txt_in.refiner_blocks.\1.mlp.fc_in", + ), + ( + r"^context_embedder\.token_refiner\.refiner_blocks\.(\d+)\.ff\.net\.2(?:\.proj)?$", + r"txt_in.refiner_blocks.\1.mlp.fc_out", + ), + ( + r"^context_embedder\.token_refiner\.refiner_blocks\.(\d+)\.norm_out\.linear$", + r"txt_in.refiner_blocks.\1.adaLN_modulation.linear", + ), + (r"^x_embedder\.proj$", r"img_in.proj"), + (r"^time_text_embed\.timestep_embedder\.linear_1$", r"time_in.mlp.fc_in"), + (r"^time_text_embed\.timestep_embedder\.linear_2$", r"time_in.mlp.fc_out"), + (r"^time_text_embed\.guidance_embedder\.linear_1$", r"guidance_in.mlp.fc_in"), + (r"^time_text_embed\.guidance_embedder\.linear_2$", r"guidance_in.mlp.fc_out"), + 
(r"^time_text_embed\.text_embedder\.linear_1$", r"vector_in.fc_in"), + (r"^time_text_embed\.text_embedder\.linear_2$", r"vector_in.fc_out"), + (r"^transformer_blocks\.(\d+)\.norm1\.linear$", r"double_blocks.\1.img_mod.linear"), + ( + r"^transformer_blocks\.(\d+)\.norm1_context\.linear$", + r"double_blocks.\1.txt_mod.linear", + ), + (r"^transformer_blocks\.(\d+)\.attn\.norm_q$", r"double_blocks.\1.img_attn_q_norm"), + (r"^transformer_blocks\.(\d+)\.attn\.norm_k$", r"double_blocks.\1.img_attn_k_norm"), + (r"^transformer_blocks\.(\d+)\.attn\.to_[qkv]$", r"double_blocks.\1.img_attn_qkv"), + ( + r"^transformer_blocks\.(\d+)\.attn\.add_[qkv]_proj$", + r"double_blocks.\1.txt_attn_qkv", + ), + ( + r"^transformer_blocks\.(\d+)\.attn\.to_out\.0$", + r"double_blocks.\1.img_attn_proj", + ), + ( + r"^transformer_blocks\.(\d+)\.attn\.to_add_out$", + r"double_blocks.\1.txt_attn_proj", + ), + ( + r"^transformer_blocks\.(\d+)\.attn\.norm_added_q$", + r"double_blocks.\1.txt_attn_q_norm", + ), + ( + r"^transformer_blocks\.(\d+)\.attn\.norm_added_k$", + r"double_blocks.\1.txt_attn_k_norm", + ), + ( + r"^transformer_blocks\.(\d+)\.ff\.net\.0(?:\.proj)?$", + r"double_blocks.\1.img_mlp.fc_in", + ), + ( + r"^transformer_blocks\.(\d+)\.ff\.net\.2(?:\.proj)?$", + r"double_blocks.\1.img_mlp.fc_out", + ), + ( + r"^transformer_blocks\.(\d+)\.ff_context\.net\.0(?:\.proj)?$", + r"double_blocks.\1.txt_mlp.fc_in", + ), + ( + r"^transformer_blocks\.(\d+)\.ff_context\.net\.2(?:\.proj)?$", + r"double_blocks.\1.txt_mlp.fc_out", + ), + (r"^single_transformer_blocks\.(\d+)\.attn\.norm_q$", r"single_blocks.\1.q_norm"), + (r"^single_transformer_blocks\.(\d+)\.attn\.norm_k$", r"single_blocks.\1.k_norm"), + ( + r"^single_transformer_blocks\.(\d+)\.attn\.to_[qkv]$", + r"single_blocks.\1.linear1", + ), + (r"^single_transformer_blocks\.(\d+)\.proj_mlp$", r"single_blocks.\1.linear1"), + (r"^single_transformer_blocks\.(\d+)\.proj_out$", r"single_blocks.\1.linear2"), + ( + 
r"^single_transformer_blocks\.(\d+)\.norm\.linear$", + r"single_blocks.\1.modulation.linear", + ), + (r"^norm_out\.linear$", r"final_layer.adaLN_modulation.linear"), + (r"^proj_out$", r"final_layer.linear"), +] def _resolve_transformer_dir(path: str) -> str: @@ -166,7 +288,27 @@ def _load_first_shard_metadata( return dict(f.metadata() or {}) -def _module_name_variants(weight_name: str) -> list[str]: +def _map_hunyuanvideo_runtime_module_name(module_name: str) -> list[str]: + mapped_names: list[str] = [] + for pattern, replacement in HUNYUANVIDEO_RUNTIME_NAME_REPLACEMENTS: + mapped = re.sub(pattern, replacement, module_name) + if mapped != module_name: + mapped_names.append(mapped) + return mapped_names + + +def _get_runtime_module_name_mapper( + *, model_type: str, class_name: str | None +) -> Callable[[str], list[str]] | None: + if model_type == "hunyuan-video" or class_name == "HunyuanVideoTransformer3DModel": + return _map_hunyuanvideo_runtime_module_name + return None + + +def _module_name_variants( + weight_name: str, + runtime_name_mapper: Callable[[str], list[str]] | None = None, +) -> list[str]: module_name = weight_name[:-7] if weight_name.endswith(".weight") else weight_name variants = [module_name] @@ -184,6 +326,11 @@ def _module_name_variants(weight_name: str) -> list[str]: ) canonicalized.append(re.sub(r"(\.(img_mod|txt_mod))\.1$", r"\1", variant)) variants.extend(canonicalized) + if runtime_name_mapper is not None: + runtime_variants: list[str] = [] + for variant in variants: + runtime_variants.extend(runtime_name_mapper(variant)) + variants.extend(runtime_variants) deduped: list[str] = [] for variant in variants: @@ -192,8 +339,11 @@ def _module_name_variants(weight_name: str) -> list[str]: return deduped -def _preferred_module_name(weight_name: str) -> str: - return _module_name_variants(weight_name)[-1] +def _preferred_module_name( + weight_name: str, + runtime_name_mapper: Callable[[str], list[str]] | None = None, +) -> str: + return 
_module_name_variants(weight_name, runtime_name_mapper)[-1] def _scale_key_candidates(weight_name: str) -> list[str]: @@ -271,6 +421,8 @@ def get_default_keep_bf16_patterns( return list(DEFAULT_FLUX2_KEEP_BF16_PATTERNS) if model_type == "qwen-image": return list(DEFAULT_QWEN_IMAGE_KEEP_BF16_PATTERNS) + if model_type == "hunyuan-video": + return list(DEFAULT_HUNYUANVIDEO_KEEP_BF16_PATTERNS) if model_type == "none": return [] if class_name == "FluxTransformer2DModel": @@ -279,12 +431,15 @@ def get_default_keep_bf16_patterns( return list(DEFAULT_FLUX2_KEEP_BF16_PATTERNS) if class_name == "QwenImageTransformer2DModel": return list(DEFAULT_QWEN_IMAGE_KEEP_BF16_PATTERNS) + if class_name == "HunyuanVideoTransformer3DModel": + return list(DEFAULT_HUNYUANVIDEO_KEEP_BF16_PATTERNS) return [] def should_keep_bf16( weight_name: str, keep_bf16_patterns: Sequence[str], + runtime_name_mapper: Callable[[str], list[str]] | None = None, ) -> bool: if not keep_bf16_patterns: return False @@ -292,13 +447,14 @@ def should_keep_bf16( return any( re.search(pattern, module_name) for pattern in keep_bf16_patterns - for module_name in _module_name_variants(weight_name) + for module_name in _module_name_variants(weight_name, runtime_name_mapper) ) def is_ignored_by_modelopt( weight_name: str, ignore_patterns: Sequence[str], + runtime_name_mapper: Callable[[str], list[str]] | None = None, ) -> bool: if not ignore_patterns: return False @@ -307,7 +463,7 @@ def is_ignored_by_modelopt( regex_str = pattern.replace(".", r"\.").replace("*", r".*") if any( re.fullmatch(regex_str, module_name) - for module_name in _module_name_variants(weight_name) + for module_name in _module_name_variants(weight_name, runtime_name_mapper) ): return True return False @@ -424,6 +580,9 @@ def build_modelopt_fp8_transformer( source_weight_map=source_weight_map_all, ) class_name = config.get("_class_name") + runtime_name_mapper = _get_runtime_module_name_mapper( + model_type=model_type, class_name=class_name + ) 
ignore_patterns = list(quant_config.get("ignore", []) or []) patterns = list( get_default_keep_bf16_patterns(model_type=model_type, class_name=class_name) @@ -463,7 +622,8 @@ def build_modelopt_fp8_transformer( fallback_weight_names = sorted( weight_name for weight_name in source_weight_map - if weight_name.endswith(".weight") and should_keep_bf16(weight_name, patterns) + if weight_name.endswith(".weight") + and should_keep_bf16(weight_name, patterns, runtime_name_mapper) ) fallback_weight_names_set = set(fallback_weight_names) @@ -492,14 +652,17 @@ def build_modelopt_fp8_transformer( auto_ignore_modules = sorted( { - _preferred_module_name(weight_name) + _preferred_module_name(weight_name, runtime_name_mapper) for weight_name in source_weight_map if weight_name.endswith(".weight") and _resolve_scale_key(weight_name, fp8_scale_map) is None } ) fallback_ignore_modules = sorted( - {_preferred_module_name(weight_name) for weight_name in fallback_weight_names} + { + _preferred_module_name(weight_name, runtime_name_mapper) + for weight_name in fallback_weight_names + } ) ignore_patterns = sorted( { @@ -570,7 +733,7 @@ def build_modelopt_fp8_transformer( shard_tensors[name] = fallback_tensors[name] continue if name.endswith(".weight") and is_ignored_by_modelopt( - name, ignore_patterns + name, ignore_patterns, runtime_name_mapper ): preserved_ignored_weight_count += 1 continue @@ -620,7 +783,7 @@ def build_modelopt_fp8_transformer( for name in source_weight_map if name.endswith(".weight") and _resolve_scale_key(name, fp8_scale_map) is not None - and not is_ignored_by_modelopt(name, ignore_patterns) + and not is_ignored_by_modelopt(name, ignore_patterns, runtime_name_mapper) ), "bf16_fallback_weights": len(fallback_weight_names), "preserved_ignored_weights": preserved_ignored_weight_count, @@ -660,13 +823,21 @@ def _parse_args() -> argparse.Namespace: ) parser.add_argument( "--model-type", - choices=["auto", "flux1", "flux2", "ltx2", "qwen-image", "none"], + choices=[ + 
"auto", + "flux1", + "flux2", + "ltx2", + "qwen-image", + "hunyuan-video", + "none", + ], default="auto", help=( "Optional model-family BF16 fallback profile. 'none' uses the generic " "conversion path. 'auto' enables the validated FLUX.1 / FLUX.2 / LTX-2 / " - "Qwen Image fallback set when the export config matches those transformer " - "classes." + "Qwen Image / HunyuanVideo fallback set when the export config matches " + "those transformer classes." ), ) parser.add_argument( From 050bbf06ff1cde5e0d91c0f47b57394c58f10e17 Mon Sep 17 00:00:00 2001 From: BBuf <1182563586@qq.com> Date: Mon, 27 Apr 2026 09:57:08 +0800 Subject: [PATCH 3/5] Format HunyuanVideo FP8 mod chunk --- .../sglang/multimodal_gen/runtime/models/dits/hunyuanvideo.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/sglang/multimodal_gen/runtime/models/dits/hunyuanvideo.py b/python/sglang/multimodal_gen/runtime/models/dits/hunyuanvideo.py index 8f0bdabc54d1..db2942caf4f8 100644 --- a/python/sglang/multimodal_gen/runtime/models/dits/hunyuanvideo.py +++ b/python/sglang/multimodal_gen/runtime/models/dits/hunyuanvideo.py @@ -749,7 +749,9 @@ def should_skip_forward_for_cached_states(self, **kwargs) -> bool: img_mod2_shift, img_mod2_scale, img_mod2_gate, - ) = self.double_blocks[0].img_mod(vec_).chunk(6, dim=-1) + ) = ( + self.double_blocks[0].img_mod(vec_).chunk(6, dim=-1) + ) normed_inp = self.double_blocks[0].img_attn_norm.norm(inp) modulated_inp = modulate(normed_inp, shift=img_mod1_shift, scale=img_mod1_scale) if self.cnt == 0 or self.cnt == num_inference_steps - 1: From 27b84fc73fdfdb1694b83c8b11954bf01363c3a0 Mon Sep 17 00:00:00 2001 From: BBuf <1182563586@qq.com> Date: Mon, 27 Apr 2026 10:11:09 +0800 Subject: [PATCH 4/5] Revert "Add Qwen Image ModelOpt FP8 diffusion support" This reverts commit e586a78127d2b50ac052938633b5c63cf0809abf. 
--- docs/diffusion/quantization.md | 39 +---- .../sglang-diffusion-modelopt-quant/SKILL.md | 37 ++--- .../runtime/models/dits/qwen_image.py | 144 +++--------------- .../tools/build_modelopt_fp8_transformer.py | 17 +-- 4 files changed, 42 insertions(+), 195 deletions(-) diff --git a/docs/diffusion/quantization.md b/docs/diffusion/quantization.md index 3f1ca9e1e6ce..df60c1668a40 100644 --- a/docs/diffusion/quantization.md +++ b/docs/diffusion/quantization.md @@ -43,21 +43,21 @@ backend. | quant_family | checkpoint form | canonical CLI | supported models | extra dependency | platform / notes | |-------------------|--------------------------------------------------------------------------------------------|------------------------------------------------------------------------|-----------------------------------------|---------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------| | `fp8` | Quantized transformer component folder, or safetensors with `quantization_config` metadata | `--transformer-path` or `--transformer-weights-path` | ALL | None | Component-folder and single-file flows are both supported | -| `modelopt-fp8` | Converted ModelOpt FP8 transformer directory or repo with `config.json` | `--transformer-path` | FLUX.1, FLUX.2, Wan2.2, Qwen Image, Qwen Image Edit, HunyuanVideo | None | Serialized config stays `quant_method=modelopt` with `quant_algo=FP8`; `dit_layerwise_offload` is supported and `dit_cpu_offload` stays disabled | +| `modelopt-fp8` | Converted ModelOpt FP8 transformer directory or repo with `config.json` | `--transformer-path` | FLUX.1, FLUX.2, Wan2.2, HunyuanVideo | None | Serialized config stays `quant_method=modelopt` with `quant_algo=FP8`; `dit_layerwise_offload` is supported and `dit_cpu_offload` stays disabled | | `modelopt-nvfp4` | Mixed transformer directory/repo with `config.json`, or raw NVFP4 safetensors export/repo | 
`--transformer-path` for mixed overrides; `--transformer-weights-path` for raw exports | FLUX.1, FLUX.2, Wan2.2 | None | Mixed override repos keep the base model separate; raw exports such as `black-forest-labs/FLUX.2-dev-NVFP4` still use the weights-path flow | | `nunchaku-svdq` | Pre-quantized Nunchaku transformer weights, usually named `svdq-{int4\|fp4}_r{rank}-...` | `--transformer-weights-path` | Model-specific support such as Qwen-Image, FLUX, and Z-Image | `nunchaku` | SGLang can infer precision and rank from the filename and supports both `int4` and `nvfp4` | | `msmodelslim` | Pre-quantized msmodelslim transformer weights | `--model-path` | Wan2.2 family | None | Currently only compatible with the Ascend NPU family and supports both `w8a8` and `w4a4` | ## Validated ModelOpt Checkpoints -This section is the canonical support matrix for the diffusion ModelOpt +This section is the canonical support matrix for the seven diffusion ModelOpt checkpoints currently wired up in SGLang docs and validation coverage. Published checkpoints keep the serialized quantization config as `quant_method=modelopt`; the FP8 vs NVFP4 split below is a documentation label derived from `quant_algo`. -Seven of the eight repos live under `BBuf/*`. The FLUX.2 NVFP4 entry keeps the +Six of the seven repos live under `BBuf/*`. The FLUX.2 NVFP4 entry keeps the official `black-forest-labs/FLUX.2-dev-NVFP4` repo. | Quant Algo | Base Model | Preferred CLI | HF Repo | Current Scope | Notes | @@ -65,16 +65,14 @@ official `black-forest-labs/FLUX.2-dev-NVFP4` repo. 
| `FP8` | `black-forest-labs/FLUX.1-dev` | `--transformer-path` | `BBuf/flux1-dev-modelopt-fp8-sglang-transformer` | single-transformer override, deterministic latent/image comparison, H100 benchmark, torch-profiler trace | SGLang converter keeps a validated BF16 fallback set for modulation and FF projection layers; use `--model-id FLUX.1-dev` for local mirrors | | `FP8` | `black-forest-labs/FLUX.2-dev` | `--transformer-path` | `BBuf/flux2-dev-modelopt-fp8-sglang-transformer` | single-transformer override load and generation path | published SGLang-ready transformer override | | `FP8` | `Wan-AI/Wan2.2-T2V-A14B-Diffusers` | `--transformer-path` | `BBuf/wan22-t2v-a14b-modelopt-fp8-sglang-transformer` | primary `transformer` quantized, `transformer_2` kept BF16 | primary-transformer-only path; keep `transformer_2` on the base checkpoint, and do not describe this as dual-transformer full-model FP8 unless that path is validated separately | -| `FP8` | `Qwen/Qwen-Image` | `--transformer-path` | `BBuf/Qwen-Image-ModelOpt-FP8-SGLang` | single-transformer override, BF16-vs-FP8 image comparison, H100 benchmark, torch-profiler trace | shares the Qwen Image FP8 fallback preset; keep `img_in`, `txt_in`, timestep embedder, `norm_out.linear`, `proj_out`, `img_mod`/`txt_mod`, and `img_mlp.net.2` in BF16 | -| `FP8` | `Qwen/Qwen-Image-Edit-2511` | `--transformer-path` | `BBuf/Qwen-Image-Edit-ModelOpt-FP8-SGLang` | TI2I edit smoke, BF16-vs-FP8 image comparison, H100 benchmark | shares `QwenImageTransformer2DModel` with Qwen Image and uses the same Qwen Image FP8 fallback preset | | `FP8` | `hunyuanvideo-community/HunyuanVideo` | `--transformer-path` | `BBuf/HunyuanVideo-ModelOpt-FP8-SGLang` | single-transformer override, BF16-vs-FP8 video comparison, H100 benchmark, torch-profiler trace | HunyuanVideo uses different ModelOpt/diffusers and SGLang runtime module names; the converter maps those names before writing FP8 scale tensors and BF16 fallback ignores | | `NVFP4` | 
`black-forest-labs/FLUX.1-dev` | `--transformer-path` | `BBuf/flux1-dev-modelopt-nvfp4-sglang-transformer` | mixed BF16+NVFP4 transformer override, correctness validation, 4x RTX 5090 benchmark, torch-profiler trace | use `build_modelopt_nvfp4_transformer.py`; validated builder keeps selected FLUX.1 modules in BF16 and sets `swap_weight_nibbles=false` | | `NVFP4` | `black-forest-labs/FLUX.2-dev` | `--transformer-weights-path` | `black-forest-labs/FLUX.2-dev-NVFP4` | packed-QKV load path | official raw export repo; validated packed export detection and runtime layout handling | | `NVFP4` | `Wan-AI/Wan2.2-T2V-A14B-Diffusers` | `--transformer-path` | `BBuf/wan22-t2v-a14b-modelopt-nvfp4-sglang-transformer` | primary `transformer` quantized with ModelOpt NVFP4, `transformer_2` kept BF16 | primary-transformer-only path; keep `transformer_2` on the base checkpoint, and current B200/Blackwell bring-up uses `SGLANG_DIFFUSION_FLASHINFER_FP4_GEMM_BACKEND=cudnn` | The FLUX and Wan entries are also the intended case set for the B200 diffusion -CI job (`multimodal-gen-test-1-b200`). The Qwen Image FP8 entries are currently -H100 manual-validation artifacts. +CI job (`multimodal-gen-test-1-b200`). The HunyuanVideo FP8 entry is currently +an H100 manual-validation artifact. 
## ModelOpt FP8 @@ -100,23 +98,6 @@ sglang generate \ --save-output ``` -```bash -sglang generate \ - --model-path Qwen/Qwen-Image \ - --transformer-path BBuf/Qwen-Image-ModelOpt-FP8-SGLang \ - --prompt "A tiny astronaut reading a book under a glass greenhouse" \ - --save-output -``` - -```bash -sglang generate \ - --model-path Qwen/Qwen-Image-Edit-2511 \ - --transformer-path BBuf/Qwen-Image-Edit-ModelOpt-FP8-SGLang \ - --image-path /path/to/input.png \ - --prompt "Turn the scene into a warm watercolor illustration" \ - --save-output -``` - ```bash sglang generate \ --model-path hunyuanvideo-community/HunyuanVideo \ @@ -142,16 +123,6 @@ sglang generate \ - On disk, the quantization config stays `quant_method=modelopt` with `quant_algo=FP8`; the `modelopt-fp8` label in this document is a support family name, not a serialized config key. -- `Qwen/Qwen-Image` and `Qwen/Qwen-Image-Edit-2511` share the `qwen-image` - converter preset. Use `--model-type qwen-image` to force it, or rely on - auto-detection from `_class_name=QwenImageTransformer2DModel`. -- The validated Qwen Image FP8 fallback preset keeps `img_in`, `txt_in`, - timestep embedder linear layers, `norm_out.linear`, `proj_out`, - `transformer_blocks.*.(img_mod|txt_mod)`, and - `transformer_blocks.*.img_mlp.net.2` in BF16. -- For Qwen Image FP8 conversion, write explicit BF16 fallback tensors before - honoring ModelOpt ignored weights. Otherwise converter stats can report a - fallback while the output checkpoint still retains the source FP8 tensor. - `hunyuanvideo-community/HunyuanVideo` uses the `hunyuan-video` converter preset. Use `--model-type hunyuan-video` to force it, or rely on auto-detection from `_class_name=HunyuanVideoTransformer3DModel`. 
diff --git a/python/sglang/multimodal_gen/.claude/skills/sglang-diffusion-modelopt-quant/SKILL.md b/python/sglang/multimodal_gen/.claude/skills/sglang-diffusion-modelopt-quant/SKILL.md index 0c60baf1ad16..655aa1df5fa5 100644 --- a/python/sglang/multimodal_gen/.claude/skills/sglang-diffusion-modelopt-quant/SKILL.md +++ b/python/sglang/multimodal_gen/.claude/skills/sglang-diffusion-modelopt-quant/SKILL.md @@ -63,7 +63,7 @@ This repo now contains: Validated documentation and CI coverage currently center on these ModelOpt diffusion transformer override families: -- FP8: FLUX.1-dev, FLUX.2-dev, Wan2.2, Qwen Image, Qwen Image Edit +- FP8: FLUX.1-dev, FLUX.2-dev, Wan2.2, HunyuanVideo - NVFP4: FLUX.1-dev, FLUX.2-dev, Wan2.2 Treat a new family, a new precision, or a new checkpoint layout as unsupported until it has a documented matrix row and a matching validation story. @@ -173,32 +173,27 @@ For `FLUX.1-dev`, the validated fallback set currently keeps these modules in BF Use `--model-type flux1` to force that profile, or rely on `--model-type auto` when the export config identifies `FluxTransformer2DModel`. -Qwen Image and Qwen Image Edit share `QwenImageTransformer2DModel`, so one -ModelOpt FP8 fallback preset covers both. 
The validated Qwen Image fallback set -keeps these modules in BF16: +HunyuanVideo uses `HunyuanVideoTransformer3DModel`, so the validated +HunyuanVideo FP8 fallback preset keeps these modules in BF16: -- `img_in` -- `txt_in` -- `time_text_embed.timestep_embedder.linear_1` -- `time_text_embed.timestep_embedder.linear_2` +- `context_embedder.*` +- `x_embedder.proj` +- `time_text_embed.(timestep_embedder|guidance_embedder|text_embedder).linear_[12]` - `norm_out.linear` - `proj_out` -- `transformer_blocks.*.img_mlp.net.2` -- `transformer_blocks.*.img_mod` -- `transformer_blocks.*.txt_mod` +- `transformer_blocks.*.norm1.linear` +- `transformer_blocks.*.norm1_context.linear` +- `single_transformer_blocks.*.norm.linear` -Use `--model-type qwen-image` to force that profile, or rely on +Use `--model-type hunyuan-video` to force that profile, or rely on `--model-type auto` when the export config identifies -`QwenImageTransformer2DModel`. - -Qwen modulation weights can appear in safetensors as `.img_mod.1.weight` and -`.txt_mod.1.weight`. Canonicalize those module names to `.img_mod` and -`.txt_mod` before fallback matching. +`HunyuanVideoTransformer3DModel`. -For Qwen Image FP8, explicit BF16 fallback tensors must be written before -honoring ModelOpt ignored weights. Otherwise converter stats can report a -fallback while the output checkpoint still retains the source FP8 tensor, which -causes severe image-quality regressions. +HunyuanVideo ModelOpt exports use diffusers module names that differ from +SGLang runtime names for fused QKV and fused QKV+MLP layers. Keep the +diffusers-to-runtime mapping in `build_modelopt_fp8_transformer.py` in sync +with `runtime/models/dits/hunyuanvideo.py` before trusting converted scale +tensors. 
For FLUX.1-dev NVFP4 model families that need a mixed BF16+NVFP4 checkpoint, build the merged transformer explicitly: diff --git a/python/sglang/multimodal_gen/runtime/models/dits/qwen_image.py b/python/sglang/multimodal_gen/runtime/models/dits/qwen_image.py index bfb5a19cdb63..1b3bc8a4ae4d 100644 --- a/python/sglang/multimodal_gen/runtime/models/dits/qwen_image.py +++ b/python/sglang/multimodal_gen/runtime/models/dits/qwen_image.py @@ -10,6 +10,7 @@ import torch import torch.nn as nn import torch.nn.functional as F +from diffusers.models.attention import FeedForward from diffusers.models.embeddings import TimestepEmbedding, Timesteps from diffusers.models.modeling_outputs import Transformer2DModelOutput from diffusers.models.normalization import AdaLayerNormContinuous @@ -531,27 +532,10 @@ def __init__( prefix=f"{prefix}.to_qkv", ) else: - self.to_q = ReplicatedLinear( - dim, - self.inner_dim, - bias=True, - quant_config=quant_config, - prefix=f"{prefix}.to_q", - ) - self.to_k = ReplicatedLinear( - dim, - self.inner_dim, - bias=True, - quant_config=quant_config, - prefix=f"{prefix}.to_k", - ) - self.to_v = ReplicatedLinear( - dim, - self.inner_dim, - bias=True, - quant_config=quant_config, - prefix=f"{prefix}.to_v", - ) + # Use separate Q/K/V projections for non-quantized models + self.to_q = ReplicatedLinear(dim, self.inner_dim, bias=True) + self.to_k = ReplicatedLinear(dim, self.inner_dim, bias=True) + self.to_v = ReplicatedLinear(dim, self.inner_dim, bias=True) if self.qk_norm: self.norm_q = RMSNorm(head_dim, eps=eps) if qk_norm else nn.Identity() @@ -568,26 +552,15 @@ def __init__( prefix=f"{prefix}.to_added_qkv", ) else: + # Use separate Q/K/V projections for non-quantized models self.add_q_proj = ReplicatedLinear( - added_kv_proj_dim, - self.inner_dim, - bias=True, - quant_config=quant_config, - prefix=f"{prefix}.add_q_proj", + added_kv_proj_dim, self.inner_dim, bias=True ) self.add_k_proj = ReplicatedLinear( - added_kv_proj_dim, - self.inner_dim, - 
bias=True, - quant_config=quant_config, - prefix=f"{prefix}.add_k_proj", + added_kv_proj_dim, self.inner_dim, bias=True ) self.add_v_proj = ReplicatedLinear( - added_kv_proj_dim, - self.inner_dim, - bias=True, - quant_config=quant_config, - prefix=f"{prefix}.add_v_proj", + added_kv_proj_dim, self.inner_dim, bias=True ) if context_pre_only is not None and not context_pre_only: @@ -728,65 +701,6 @@ def forward( return img_attn_output, txt_attn_output -class QwenImageGELU(nn.Module): - def __init__( - self, - dim: int, - inner_dim: int, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: - super().__init__() - self.proj = ReplicatedLinear( - dim, - inner_dim, - bias=True, - quant_config=quant_config, - prefix=f"{prefix}.proj", - ) - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states, _ = self.proj(hidden_states) - return F.gelu(hidden_states, approximate="tanh") - - -class QwenImageFeedForward(nn.Module): - def __init__( - self, - dim: int, - dim_out: int, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - mult: int = 4, - ) -> None: - super().__init__() - inner_dim = dim * mult - self.net = nn.ModuleList( - [ - QwenImageGELU( - dim, - inner_dim, - quant_config=quant_config, - prefix=f"{prefix}.net.0", - ), - nn.Dropout(0.0), - ReplicatedLinear( - inner_dim, - dim_out, - bias=True, - quant_config=quant_config, - prefix=f"{prefix}.net.2", - ), - ] - ) - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.net[0](hidden_states) - hidden_states = self.net[1](hidden_states) - hidden_states, _ = self.net[2](hidden_states) - return hidden_states - - class QwenImageTransformerBlock(nn.Module): def __init__( self, @@ -876,17 +790,15 @@ def __init__( activation_fn="gelu-approximate", ) else: - self.img_mlp = QwenImageFeedForward( + self.img_mlp = FeedForward( dim=dim, dim_out=dim, - quant_config=quant_config, - prefix=f"{prefix}.img_mlp", + 
activation_fn="gelu-approximate", ) - self.txt_mlp = QwenImageFeedForward( + self.txt_mlp = FeedForward( dim=dim, dim_out=dim, - quant_config=quant_config, - prefix=f"{prefix}.txt_mlp", + activation_fn="gelu-approximate", ) if nunchaku_enabled: @@ -1208,20 +1120,8 @@ def __init__( self.txt_norm = RMSNorm(joint_attention_dim, eps=1e-6) - self.img_in = ReplicatedLinear( - in_channels, - self.inner_dim, - bias=True, - quant_config=quant_config, - prefix="img_in", - ) - self.txt_in = ReplicatedLinear( - joint_attention_dim, - self.inner_dim, - bias=True, - quant_config=quant_config, - prefix="txt_in", - ) + self.img_in = nn.Linear(in_channels, self.inner_dim) + self.txt_in = nn.Linear(joint_attention_dim, self.inner_dim) self.transformer_blocks = nn.ModuleList( [ @@ -1240,12 +1140,8 @@ def __init__( self.norm_out = AdaLayerNormContinuous( self.inner_dim, self.inner_dim, elementwise_affine=False, eps=1e-6 ) - self.proj_out = ReplicatedLinear( - self.inner_dim, - patch_size * patch_size * self.out_channels, - bias=True, - quant_config=quant_config, - prefix="proj_out", + self.proj_out = nn.Linear( + self.inner_dim, patch_size * patch_size * self.out_channels, bias=True ) self.timestep_zero = torch.zeros( @@ -1328,7 +1224,7 @@ def forward( if isinstance(encoder_hidden_states, list): encoder_hidden_states = encoder_hidden_states[0] - hidden_states, _ = self.img_in(hidden_states) + hidden_states = self.img_in(hidden_states) timestep = (timestep / 1000).to(hidden_states.dtype) @@ -1340,7 +1236,7 @@ def forward( modulate_index = None encoder_hidden_states = self.txt_norm(encoder_hidden_states) - encoder_hidden_states, _ = self.txt_in(encoder_hidden_states) + encoder_hidden_states = self.txt_in(encoder_hidden_states) temb = self.time_text_embed(timestep, hidden_states, additional_t_cond) @@ -1378,7 +1274,7 @@ def forward( # Use only the image part (hidden_states) from the dual-stream blocks hidden_states = self.norm_out(hidden_states, temb_txt) - output, _ = 
self.proj_out(hidden_states) + output = self.proj_out(hidden_states) return output diff --git a/python/sglang/multimodal_gen/tools/build_modelopt_fp8_transformer.py b/python/sglang/multimodal_gen/tools/build_modelopt_fp8_transformer.py index f2d965efd967..d5d891a4ff04 100644 --- a/python/sglang/multimodal_gen/tools/build_modelopt_fp8_transformer.py +++ b/python/sglang/multimodal_gen/tools/build_modelopt_fp8_transformer.py @@ -76,15 +76,6 @@ r"^transformer_blocks\.(0|43|44|45|46|47)\.(attn1|attn2|audio_attn1|audio_attn2|audio_to_video_attn|video_to_audio_attn)\.to_out\.0$", r"^transformer_blocks\.(0|43|44|45|46|47)\.(ff|audio_ff)\.proj_(in|out)$", ] -DEFAULT_QWEN_IMAGE_KEEP_BF16_PATTERNS = [ - r"^img_in$", - r"^txt_in$", - r"^time_text_embed\.timestep_embedder\.linear_[12]$", - r"^norm_out\.linear$", - r"^proj_out$", - r"^transformer_blocks\.\d+\.img_mlp\.net\.2$", - r"^transformer_blocks\.\d+\.(img_mod|txt_mod)$", -] DEFAULT_HUNYUANVIDEO_KEEP_BF16_PATTERNS = [ r"^context_embedder\.", r"^x_embedder\.proj$", @@ -324,7 +315,6 @@ def _module_name_variants( canonicalized.append( re.sub(r"(\.audio_ff|\.ff)\.net\.2$", r"\1.proj_out", variant) ) - canonicalized.append(re.sub(r"(\.(img_mod|txt_mod))\.1$", r"\1", variant)) variants.extend(canonicalized) if runtime_name_mapper is not None: runtime_variants: list[str] = [] @@ -419,8 +409,6 @@ def get_default_keep_bf16_patterns( return list(DEFAULT_FLUX1_KEEP_BF16_PATTERNS) if model_type == "flux2": return list(DEFAULT_FLUX2_KEEP_BF16_PATTERNS) - if model_type == "qwen-image": - return list(DEFAULT_QWEN_IMAGE_KEEP_BF16_PATTERNS) if model_type == "hunyuan-video": return list(DEFAULT_HUNYUANVIDEO_KEEP_BF16_PATTERNS) if model_type == "none": @@ -429,8 +417,6 @@ def get_default_keep_bf16_patterns( return list(DEFAULT_FLUX1_KEEP_BF16_PATTERNS) if class_name == "Flux2Transformer2DModel": return list(DEFAULT_FLUX2_KEEP_BF16_PATTERNS) - if class_name == "QwenImageTransformer2DModel": - return list(DEFAULT_QWEN_IMAGE_KEEP_BF16_PATTERNS) 
if class_name == "HunyuanVideoTransformer3DModel": return list(DEFAULT_HUNYUANVIDEO_KEEP_BF16_PATTERNS) return [] @@ -828,7 +814,6 @@ def _parse_args() -> argparse.Namespace: "flux1", "flux2", "ltx2", - "qwen-image", "hunyuan-video", "none", ], @@ -836,7 +821,7 @@ def _parse_args() -> argparse.Namespace: help=( "Optional model-family BF16 fallback profile. 'none' uses the generic " "conversion path. 'auto' enables the validated FLUX.1 / FLUX.2 / LTX-2 / " - "Qwen Image / HunyuanVideo fallback set when the export config matches " + "HunyuanVideo fallback set when the export config matches " "those transformer classes." ), ) From ae41d37b1ad0dfc1600804c69b0a9d4284403c7a Mon Sep 17 00:00:00 2001 From: BBuf <1182563586@qq.com> Date: Tue, 28 Apr 2026 16:14:21 +0800 Subject: [PATCH 5/5] Use lmsys HunyuanVideo ModelOpt checkpoint --- docs/diffusion/quantization.md | 29 +++++++------ .../docs/sglang-diffusion/quantization.mdx | 41 +++++++++++++------ .../sglang-diffusion-modelopt-quant/SKILL.md | 2 +- .../multimodal_gen/test/server/gpu_cases.py | 13 ++++++ .../test/server/testcase_configs.py | 13 +++--- 5 files changed, 65 insertions(+), 33 deletions(-) diff --git a/docs/diffusion/quantization.md b/docs/diffusion/quantization.md index df60c1668a40..e5eeac208529 100644 --- a/docs/diffusion/quantization.md +++ b/docs/diffusion/quantization.md @@ -57,22 +57,21 @@ Published checkpoints keep the serialized quantization config as `quant_method=modelopt`; the FP8 vs NVFP4 split below is a documentation label derived from `quant_algo`. -Six of the seven repos live under `BBuf/*`. The FLUX.2 NVFP4 entry keeps the +Six of the seven repos live under `lmsys/*`. The FLUX.2 NVFP4 entry keeps the official `black-forest-labs/FLUX.2-dev-NVFP4` repo. 
| Quant Algo | Base Model | Preferred CLI | HF Repo | Current Scope | Notes | | --- | --- | --- | --- | --- | --- | -| `FP8` | `black-forest-labs/FLUX.1-dev` | `--transformer-path` | `BBuf/flux1-dev-modelopt-fp8-sglang-transformer` | single-transformer override, deterministic latent/image comparison, H100 benchmark, torch-profiler trace | SGLang converter keeps a validated BF16 fallback set for modulation and FF projection layers; use `--model-id FLUX.1-dev` for local mirrors | -| `FP8` | `black-forest-labs/FLUX.2-dev` | `--transformer-path` | `BBuf/flux2-dev-modelopt-fp8-sglang-transformer` | single-transformer override load and generation path | published SGLang-ready transformer override | -| `FP8` | `Wan-AI/Wan2.2-T2V-A14B-Diffusers` | `--transformer-path` | `BBuf/wan22-t2v-a14b-modelopt-fp8-sglang-transformer` | primary `transformer` quantized, `transformer_2` kept BF16 | primary-transformer-only path; keep `transformer_2` on the base checkpoint, and do not describe this as dual-transformer full-model FP8 unless that path is validated separately | -| `FP8` | `hunyuanvideo-community/HunyuanVideo` | `--transformer-path` | `BBuf/HunyuanVideo-ModelOpt-FP8-SGLang` | single-transformer override, BF16-vs-FP8 video comparison, H100 benchmark, torch-profiler trace | HunyuanVideo uses different ModelOpt/diffusers and SGLang runtime module names; the converter maps those names before writing FP8 scale tensors and BF16 fallback ignores | -| `NVFP4` | `black-forest-labs/FLUX.1-dev` | `--transformer-path` | `BBuf/flux1-dev-modelopt-nvfp4-sglang-transformer` | mixed BF16+NVFP4 transformer override, correctness validation, 4x RTX 5090 benchmark, torch-profiler trace | use `build_modelopt_nvfp4_transformer.py`; validated builder keeps selected FLUX.1 modules in BF16 and sets `swap_weight_nibbles=false` | +| `FP8` | `black-forest-labs/FLUX.1-dev` | `--transformer-path` | `lmsys/flux1-dev-modelopt-fp8-sglang-transformer` | single-transformer override, deterministic latent/image 
comparison, H100 benchmark, torch-profiler trace | SGLang converter keeps a validated BF16 fallback set for modulation and FF projection layers; use `--model-id FLUX.1-dev` for local mirrors | +| `FP8` | `black-forest-labs/FLUX.2-dev` | `--transformer-path` | `lmsys/flux2-dev-modelopt-fp8-sglang-transformer` | single-transformer override load and generation path | published SGLang-ready transformer override | +| `FP8` | `Wan-AI/Wan2.2-T2V-A14B-Diffusers` | `--transformer-path` | `lmsys/wan22-t2v-a14b-modelopt-fp8-sglang-transformer` | primary `transformer` quantized, `transformer_2` kept BF16 | primary-transformer-only path; keep `transformer_2` on the base checkpoint, and do not describe this as dual-transformer full-model FP8 unless that path is validated separately | +| `FP8` | `hunyuanvideo-community/HunyuanVideo` | `--transformer-path` | `lmsys/hunyuanvideo-modelopt-fp8-sglang-transformer` | single-transformer override, BF16-vs-FP8 video comparison, H100 benchmark, torch-profiler trace | HunyuanVideo uses different ModelOpt/diffusers and SGLang runtime module names; the converter maps those names before writing FP8 scale tensors and BF16 fallback ignores | +| `NVFP4` | `black-forest-labs/FLUX.1-dev` | `--transformer-path` | `lmsys/flux1-dev-modelopt-nvfp4-sglang-transformer` | mixed BF16+NVFP4 transformer override, correctness validation, 4x RTX 5090 benchmark, torch-profiler trace | use `build_modelopt_nvfp4_transformer.py`; validated builder keeps selected FLUX.1 modules in BF16 and sets `swap_weight_nibbles=false` | | `NVFP4` | `black-forest-labs/FLUX.2-dev` | `--transformer-weights-path` | `black-forest-labs/FLUX.2-dev-NVFP4` | packed-QKV load path | official raw export repo; validated packed export detection and runtime layout handling | -| `NVFP4` | `Wan-AI/Wan2.2-T2V-A14B-Diffusers` | `--transformer-path` | `BBuf/wan22-t2v-a14b-modelopt-nvfp4-sglang-transformer` | primary `transformer` quantized with ModelOpt NVFP4, `transformer_2` kept BF16 | 
primary-transformer-only path; keep `transformer_2` on the base checkpoint, and current B200/Blackwell bring-up uses `SGLANG_DIFFUSION_FLASHINFER_FP4_GEMM_BACKEND=cudnn` | +| `NVFP4` | `Wan-AI/Wan2.2-T2V-A14B-Diffusers` | `--transformer-path` | `lmsys/wan22-t2v-a14b-modelopt-nvfp4-sglang-transformer` | primary `transformer` quantized with ModelOpt NVFP4, `transformer_2` kept BF16 | primary-transformer-only path; keep `transformer_2` on the base checkpoint, and current B200/Blackwell bring-up uses `SGLANG_DIFFUSION_FLASHINFER_FP4_GEMM_BACKEND=cudnn` | -The FLUX and Wan entries are also the intended case set for the B200 diffusion -CI job (`multimodal-gen-test-1-b200`). The HunyuanVideo FP8 entry is currently -an H100 manual-validation artifact. +These seven checkpoints are also the intended case set for the B200 diffusion +CI job (`multimodal-gen-test-1-b200`). ## ModelOpt FP8 @@ -85,7 +84,7 @@ overrides. If the repo or local directory already contains `config.json`, use ```bash sglang generate \ --model-path black-forest-labs/FLUX.2-dev \ - --transformer-path BBuf/flux2-dev-modelopt-fp8-sglang-transformer \ + --transformer-path lmsys/flux2-dev-modelopt-fp8-sglang-transformer \ --prompt "A Logo With Bold Large Text: SGL Diffusion" \ --save-output ``` @@ -93,7 +92,7 @@ sglang generate \ ```bash sglang generate \ --model-path Wan-AI/Wan2.2-T2V-A14B-Diffusers \ - --transformer-path BBuf/wan22-t2v-a14b-modelopt-fp8-sglang-transformer \ + --transformer-path lmsys/wan22-t2v-a14b-modelopt-fp8-sglang-transformer \ --prompt "a fox walking through neon rain" \ --save-output ``` @@ -101,7 +100,7 @@ sglang generate \ ```bash sglang generate \ --model-path hunyuanvideo-community/HunyuanVideo \ - --transformer-path BBuf/HunyuanVideo-ModelOpt-FP8-SGLang \ + --transformer-path lmsys/hunyuanvideo-modelopt-fp8-sglang-transformer \ --height 544 --width 960 --num-frames 17 \ --prompt "A cinematic shot of a red sports car driving through rain at night" \ --save-output @@ -148,7 +147,7 
@@ For mixed ModelOpt NVFP4 transformer overrides that already contain ```bash sglang generate \ --model-path black-forest-labs/FLUX.1-dev \ - --transformer-path BBuf/flux1-dev-modelopt-nvfp4-sglang-transformer \ + --transformer-path lmsys/flux1-dev-modelopt-nvfp4-sglang-transformer \ --prompt "A Logo With Bold Large Text: SGL Diffusion" \ --save-output ``` @@ -181,7 +180,7 @@ was quantized: SGLANG_DIFFUSION_FLASHINFER_FP4_GEMM_BACKEND=cudnn \ sglang generate \ --model-path Wan-AI/Wan2.2-T2V-A14B-Diffusers \ - --transformer-path BBuf/wan22-t2v-a14b-modelopt-nvfp4-sglang-transformer \ + --transformer-path lmsys/wan22-t2v-a14b-modelopt-nvfp4-sglang-transformer \ --prompt "a fox walking through neon rain" \ --save-output ``` diff --git a/docs_new/docs/sglang-diffusion/quantization.mdx b/docs_new/docs/sglang-diffusion/quantization.mdx index 62d336f34b3e..9d2749dfd187 100644 --- a/docs_new/docs/sglang-diffusion/quantization.mdx +++ b/docs_new/docs/sglang-diffusion/quantization.mdx @@ -109,14 +109,14 @@ backend. ## Validated ModelOpt Checkpoints -This section is the canonical support matrix for the six diffusion ModelOpt +This section is the canonical support matrix for the seven diffusion ModelOpt checkpoints currently wired up in SGLang docs and B200 CI coverage. Published checkpoints keep the serialized quantization config as `quant_method=modelopt`; the FP8 vs NVFP4 split below is a documentation label derived from `quant_algo`. -Five of the six repos live under `BBuf/*`. The FLUX.2 NVFP4 entry keeps the +Six of the seven repos live under `lmsys/*`. The FLUX.2 NVFP4 entry keeps the official `black-forest-labs/FLUX.2-dev-NVFP4` repo. @@ -143,7 +143,7 @@ official `black-forest-labs/FLUX.2-dev-NVFP4` repo. - + @@ -151,7 +151,7 @@ official `black-forest-labs/FLUX.2-dev-NVFP4` repo. - + @@ -159,15 +159,23 @@ official `black-forest-labs/FLUX.2-dev-NVFP4` repo. - + + + + + + + + + - + @@ -183,14 +191,14 @@ official `black-forest-labs/FLUX.2-dev-NVFP4` repo. - +
FP8 black-forest-labs/FLUX.1-dev --transformer-pathBBuf/flux1-dev-modelopt-fp8-sglang-transformerlmsys/flux1-dev-modelopt-fp8-sglang-transformer single-transformer override, deterministic latent/image comparison, H100 benchmark, torch-profiler trace SGLang converter keeps a validated BF16 fallback set for modulation and FF projection layers; use --model-id FLUX.1-dev for local mirrors
FP8 black-forest-labs/FLUX.2-dev --transformer-pathBBuf/flux2-dev-modelopt-fp8-sglang-transformerlmsys/flux2-dev-modelopt-fp8-sglang-transformer single-transformer override load and generation path published SGLang-ready transformer override
FP8 Wan-AI/Wan2.2-T2V-A14B-Diffusers --transformer-pathBBuf/wan22-t2v-a14b-modelopt-fp8-sglang-transformerlmsys/wan22-t2v-a14b-modelopt-fp8-sglang-transformer primary transformer quantized, transformer_2 kept BF16 primary-transformer-only path; keep transformer_2 on the base checkpoint, and do not describe this as dual-transformer full-model FP8 unless that path is validated separately
FP8hunyuanvideo-community/HunyuanVideo--transformer-pathlmsys/hunyuanvideo-modelopt-fp8-sglang-transformersingle-transformer override, BF16-vs-FP8 video comparison, H100 benchmark, torch-profiler traceHunyuanVideo uses different ModelOpt/diffusers and SGLang runtime module names; the converter maps those names before writing FP8 scale tensors and BF16 fallback ignores
NVFP4 black-forest-labs/FLUX.1-dev --transformer-pathBBuf/flux1-dev-modelopt-nvfp4-sglang-transformerlmsys/flux1-dev-modelopt-nvfp4-sglang-transformer mixed BF16+NVFP4 transformer override, correctness validation, 4x RTX 5090 benchmark, torch-profiler trace use build_modelopt_nvfp4_transformer.py; validated builder keeps selected FLUX.1 modules in BF16 and sets swap_weight_nibbles=false
NVFP4 Wan-AI/Wan2.2-T2V-A14B-Diffusers --transformer-pathBBuf/wan22-t2v-a14b-modelopt-nvfp4-sglang-transformerlmsys/wan22-t2v-a14b-modelopt-nvfp4-sglang-transformer primary transformer quantized with ModelOpt NVFP4, transformer_2 kept BF16 primary-transformer-only path; keep transformer_2 on the base checkpoint, and current B200/Blackwell bring-up uses SGLANG_DIFFUSION_FLASHINFER_FP4_GEMM_BACKEND=cudnn
-These six checkpoints are also the intended case set for the B200 diffusion CI +These seven checkpoints are also the intended case set for the B200 diffusion CI job (`multimodal-gen-test-1-b200`). ## ModelOpt FP8 @@ -204,7 +212,7 @@ overrides. If the repo or local directory already contains `config.json`, use ```bash sglang generate \ --model-path black-forest-labs/FLUX.2-dev \ - --transformer-path BBuf/flux2-dev-modelopt-fp8-sglang-transformer \ + --transformer-path lmsys/flux2-dev-modelopt-fp8-sglang-transformer \ --prompt "A Logo With Bold Large Text: SGL Diffusion" \ --save-output ``` @@ -212,11 +220,20 @@ sglang generate \ ```bash sglang generate \ --model-path Wan-AI/Wan2.2-T2V-A14B-Diffusers \ - --transformer-path BBuf/wan22-t2v-a14b-modelopt-fp8-sglang-transformer \ + --transformer-path lmsys/wan22-t2v-a14b-modelopt-fp8-sglang-transformer \ --prompt "a fox walking through neon rain" \ --save-output ``` +```bash +sglang generate \ + --model-path hunyuanvideo-community/HunyuanVideo \ + --transformer-path lmsys/hunyuanvideo-modelopt-fp8-sglang-transformer \ + --height 544 --width 960 --num-frames 17 \ + --prompt "A cinematic shot of a red sports car driving through rain at night" \ + --save-output +``` + ### Notes - `--transformer-path` is the canonical flag for converted ModelOpt FP8 @@ -247,7 +264,7 @@ For mixed ModelOpt NVFP4 transformer overrides that already contain ```bash sglang generate \ --model-path black-forest-labs/FLUX.1-dev \ - --transformer-path BBuf/flux1-dev-modelopt-nvfp4-sglang-transformer \ + --transformer-path lmsys/flux1-dev-modelopt-nvfp4-sglang-transformer \ --prompt "A Logo With Bold Large Text: SGL Diffusion" \ --save-output ``` @@ -280,7 +297,7 @@ was quantized: SGLANG_DIFFUSION_FLASHINFER_FP4_GEMM_BACKEND=cudnn \ sglang generate \ --model-path Wan-AI/Wan2.2-T2V-A14B-Diffusers \ - --transformer-path BBuf/wan22-t2v-a14b-modelopt-nvfp4-sglang-transformer \ + --transformer-path lmsys/wan22-t2v-a14b-modelopt-nvfp4-sglang-transformer \ 
--prompt "a fox walking through neon rain" \ --save-output ``` diff --git a/python/sglang/multimodal_gen/.claude/skills/sglang-diffusion-modelopt-quant/SKILL.md b/python/sglang/multimodal_gen/.claude/skills/sglang-diffusion-modelopt-quant/SKILL.md index 655aa1df5fa5..cc28ce497bdb 100644 --- a/python/sglang/multimodal_gen/.claude/skills/sglang-diffusion-modelopt-quant/SKILL.md +++ b/python/sglang/multimodal_gen/.claude/skills/sglang-diffusion-modelopt-quant/SKILL.md @@ -67,7 +67,7 @@ Validated documentation and CI coverage currently center on these ModelOpt diffu - NVFP4: FLUX.1-dev, FLUX.2-dev, Wan2.2 Treat a new family, a new precision, or a new checkpoint layout as unsupported until it has a documented matrix row and a matching validation story. -Before writing CLI examples, re-read the active branch's `docs/diffusion/quantization.md`: FLUX.2 NVFP4 is an official `black-forest-labs/*` repo rather than a `BBuf/*` converted repo, and its preferred flag depends on the current documented loader flow. Use `--transformer-path` for a component override directory with `config.json`; use `--transformer-weights-path` when the repo or path should be probed as raw weights. +Before writing CLI examples, re-read the active branch's `docs/diffusion/quantization.md`: FLUX.2 NVFP4 is an official `black-forest-labs/*` repo rather than a `lmsys/*` converted repo, and its preferred flag depends on the current documented loader flow. Use `--transformer-path` for a component override directory with `config.json`; use `--transformer-weights-path` when the repo or path should be probed as raw weights. B200 CI coverage can include loose BF16-vs-quantized quality smoke checks. Inspect the active branch's `run_suite.py` before assuming they are part of the suite; mainline and feature branches may differ. Those checks are intended to catch blank, corrupted, or obviously divergent images, not exact image parity. 
diff --git a/python/sglang/multimodal_gen/test/server/gpu_cases.py b/python/sglang/multimodal_gen/test/server/gpu_cases.py index 31edfa1edb50..1e556f1e9c96 100644 --- a/python/sglang/multimodal_gen/test/server/gpu_cases.py +++ b/python/sglang/multimodal_gen/test/server/gpu_cases.py @@ -4,6 +4,7 @@ MODELOPT_FLUX1_NVFP4_TRANSFORMER, MODELOPT_FLUX2_FP8_TRANSFORMER, MODELOPT_FLUX2_NVFP4_WEIGHTS, + MODELOPT_HUNYUANVIDEO_FP8_TRANSFORMER, MODELOPT_NVFP4_B200_ENV_VARS, MODELOPT_WAN22_FP8_TRANSFORMER, MODELOPT_WAN22_NVFP4_TRANSFORMER, @@ -387,6 +388,18 @@ sampling_params=MODELOPT_T2V_CI_sampling_params, extras=["--transformer-path", MODELOPT_WAN22_FP8_TRANSFORMER], ), + _make_modelopt_ci_case( + "hunyuanvideo_modelopt_fp8_t2v", + model_path="hunyuanvideo-community/HunyuanVideo", + modality="video", + sampling_params=MODELOPT_T2V_CI_sampling_params, + extras=[ + "--transformer-path", + MODELOPT_HUNYUANVIDEO_FP8_TRANSFORMER, + "--text-encoder-cpu-offload", + "--pin-cpu-memory", + ], + ), _make_modelopt_ci_case( "flux1_modelopt_nvfp4_t2i", model_path=DEFAULT_FLUX_1_DEV_MODEL_NAME_FOR_TEST, diff --git a/python/sglang/multimodal_gen/test/server/testcase_configs.py b/python/sglang/multimodal_gen/test/server/testcase_configs.py index c25bb2d2f67b..9e639aa92072 100644 --- a/python/sglang/multimodal_gen/test/server/testcase_configs.py +++ b/python/sglang/multimodal_gen/test/server/testcase_configs.py @@ -431,13 +431,16 @@ def from_req_perf_record( image_path="https://raw.githubusercontent.com/sgl-project/sgl-test-files/main/diffusion-ci/consistency_gt/1-gpu/hunyuan3d_2_0/hunyuan3d.png", ) -MODELOPT_FLUX1_FP8_TRANSFORMER = "BBuf/flux1-dev-modelopt-fp8-sglang-transformer" -MODELOPT_FLUX2_FP8_TRANSFORMER = "BBuf/flux2-dev-modelopt-fp8-sglang-transformer" -MODELOPT_WAN22_FP8_TRANSFORMER = "BBuf/wan22-t2v-a14b-modelopt-fp8-sglang-transformer" -MODELOPT_FLUX1_NVFP4_TRANSFORMER = "BBuf/flux1-dev-modelopt-nvfp4-sglang-transformer" +MODELOPT_FLUX1_FP8_TRANSFORMER = 
"lmsys/flux1-dev-modelopt-fp8-sglang-transformer" +MODELOPT_FLUX2_FP8_TRANSFORMER = "lmsys/flux2-dev-modelopt-fp8-sglang-transformer" +MODELOPT_WAN22_FP8_TRANSFORMER = "lmsys/wan22-t2v-a14b-modelopt-fp8-sglang-transformer" +MODELOPT_HUNYUANVIDEO_FP8_TRANSFORMER = ( + "lmsys/hunyuanvideo-modelopt-fp8-sglang-transformer" +) +MODELOPT_FLUX1_NVFP4_TRANSFORMER = "lmsys/flux1-dev-modelopt-nvfp4-sglang-transformer" MODELOPT_FLUX2_NVFP4_WEIGHTS = "black-forest-labs/FLUX.2-dev-NVFP4" MODELOPT_WAN22_NVFP4_TRANSFORMER = ( - "BBuf/wan22-t2v-a14b-modelopt-nvfp4-sglang-transformer" + "lmsys/wan22-t2v-a14b-modelopt-nvfp4-sglang-transformer" ) MODELOPT_NVFP4_B200_ENV_VARS = {"SGLANG_DIFFUSION_FLASHINFER_FP4_GEMM_BACKEND": "cudnn"}