From 6be8de51486f9ee3016cc3ea5f0b997e34a37d8f Mon Sep 17 00:00:00 2001 From: Zhang Date: Tue, 14 Apr 2026 08:37:42 +0000 Subject: [PATCH 1/6] [BugFix][Z-Image] Keep modulation and embedder layers unquantized under FP8 (#2728) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit FP8 online quantization on Z-Image-Turbo produced pure pixel noise (LPIPS 0.74 vs BF16). Root cause: small precision-sensitive layers — TimestepEmbedder MLP, x_embedder, cap_embedder, per-block adaLN_modulation, and FinalLayer's output/modulation — were being FP8-quantized. Errors on these layers feed the scale chain that multiplies the residual stream every block, so small per-layer drift turns into catastrophic magnitude blow-up by layer 30. Mirrors the earlier OmniGen2 FP8 fix (dbf8b7c7). Swap these 6 layers from `ReplicatedLinear` to plain `nn.Linear`. Main-path matmuls (to_qkv, to_out, feed_forward.w13, feed_forward.w2) stay FP8, so the memory win is preserved. After fix: LPIPS 0.0659 (PASS, threshold 0.1). Signed-off-by: Zhang --- .../models/z_image/z_image_transformer.py | 45 ++++++++----------- 1 file changed, 18 insertions(+), 27 deletions(-) diff --git a/vllm_omni/diffusion/models/z_image/z_image_transformer.py b/vllm_omni/diffusion/models/z_image/z_image_transformer.py index 3ffad221ba9..f4a1586b056 100644 --- a/vllm_omni/diffusion/models/z_image/z_image_transformer.py +++ b/vllm_omni/diffusion/models/z_image/z_image_transformer.py @@ -214,22 +214,12 @@ def __init__( super().__init__() if mid_size is None: mid_size = out_size + # Time embedding MLP is kept full precision — small layers that + # feed adaLN; precision-sensitive (see issue #2728). 
self.mlp = nn.Sequential( - ReplicatedLinear( - frequency_embedding_size, - mid_size, - bias=True, - quant_config=quant_config, - return_bias=False, - ), + nn.Linear(frequency_embedding_size, mid_size, bias=True), nn.SiLU(), - ReplicatedLinear( - mid_size, - out_size, - bias=True, - quant_config=quant_config, - return_bias=False, - ), + nn.Linear(mid_size, out_size, bias=True), ) self.frequency_embedding_size = frequency_embedding_size @@ -426,10 +416,11 @@ def __init__( self.modulation = modulation if modulation: + # Modulation linear is kept at full precision — it produces + # scale/gate values that are precision-sensitive (mirrors the + # OmniGen2 FP8 fix; see issue #2728). self.adaLN_modulation = nn.Sequential( - ReplicatedLinear( - min(dim, ADALN_EMBED_DIM), 4 * dim, bias=True, return_bias=False, quant_config=quant_config - ), + nn.Linear(min(dim, ADALN_EMBED_DIM), 4 * dim, bias=True), ) def forward( @@ -485,15 +476,13 @@ class FinalLayer(nn.Module): def __init__(self, hidden_size, out_channels, quant_config: "QuantizationConfig | None" = None): super().__init__() self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) - self.linear = ReplicatedLinear( - hidden_size, out_channels, bias=True, quant_config=quant_config, return_bias=False - ) + # Final output projection and its modulation are precision-sensitive + # (map latents -> image); keep at full precision (see issue #2728). 
+ self.linear = nn.Linear(hidden_size, out_channels, bias=True) self.adaLN_modulation = nn.Sequential( nn.SiLU(), - ReplicatedLinear( - min(hidden_size, ADALN_EMBED_DIM), hidden_size, bias=True, quant_config=quant_config, return_bias=False - ), + nn.Linear(min(hidden_size, ADALN_EMBED_DIM), hidden_size, bias=True), ) def forward(self, x, c): @@ -673,12 +662,12 @@ def __init__( all_x_embedder = {} all_final_layer = {} for patch_idx, (patch_size, f_patch_size) in enumerate(zip(all_patch_size, all_f_patch_size)): - x_embedder = ReplicatedLinear( + # x_embedder (patch embed) is a small precision-sensitive entry + # layer; keep full precision (see issue #2728). + x_embedder = nn.Linear( f_patch_size * patch_size * patch_size * in_channels, dim, bias=True, - quant_config=quant_config, - return_bias=False, ) all_x_embedder[f"{patch_size}-{f_patch_size}"] = x_embedder @@ -720,9 +709,11 @@ def __init__( ] ) self.t_embedder = TimestepEmbedder(min(dim, ADALN_EMBED_DIM), mid_size=1024, quant_config=quant_config) + # Caption embedder maps text features -> hidden; keep full precision + # (see issue #2728). self.cap_embedder = nn.Sequential( RMSNorm(cap_feat_dim, eps=norm_eps), - ReplicatedLinear(cap_feat_dim, dim, bias=True, return_bias=False, quant_config=quant_config), + nn.Linear(cap_feat_dim, dim, bias=True), ) self.x_pad_token = nn.Parameter(torch.empty((1, dim))) From 70af357ee79880a81495f713ca775324b1b52e7b Mon Sep 17 00:00:00 2001 From: Zhang Date: Tue, 14 Apr 2026 09:02:56 +0000 Subject: [PATCH 2/6] [BugFix][Qwen-Image] Keep modulation and embedder layers unquantized under FP8 (#2728) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit FP8 online quantization on Qwen-Image produced pure pixel noise (LPIPS 0.95 vs BF16). 
Same root cause as the Z-Image fix: precision-sensitive small layers (time embedder, img_in/txt_in entry, per-block img_mod/txt_mod modulation, norm_out.linear, proj_out) feed the shift/scale/gate chain that multiplies the residual stream every block, so small per-layer drift blows up into noise. After fix: LPIPS 0.32 (PASS, Qwen-Image threshold 0.35). Main-path matmuls (to_qkv, to_out, add_kv_proj, to_add_out, img_mlp, txt_mlp) remain FP8 for memory savings — peak ~41 GB vs ~59 GB BF16. Signed-off-by: Zhang --- .../qwen_image/qwen_image_transformer.py | 85 +++++-------------- 1 file changed, 19 insertions(+), 66 deletions(-) diff --git a/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py b/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py index b34f19e954a..acf2f2aa494 100644 --- a/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py +++ b/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py @@ -169,22 +169,10 @@ def __init__( self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0, scale=1000) self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim) - self.timestep_embedder.linear_1 = ReplicatedLinear( - 256, - embedding_dim, - bias=True, - return_bias=False, - quant_config=quant_config, - prefix="timestep_embedder.linear_1", - ) - self.timestep_embedder.linear_2 = ReplicatedLinear( - embedding_dim, - embedding_dim, - bias=True, - return_bias=False, - quant_config=quant_config, - prefix="timestep_embedder.linear_2", - ) + # Time embedding MLP is kept full precision — small layers that + # feed the per-block modulation; precision-sensitive (see issue #2728). 
+ self.timestep_embedder.linear_1 = nn.Linear(256, embedding_dim, bias=True) + self.timestep_embedder.linear_2 = nn.Linear(embedding_dim, embedding_dim, bias=True) self.use_additional_t_cond = use_additional_t_cond if use_additional_t_cond: self.addition_t_embedding = nn.Embedding(2, embedding_dim) @@ -701,17 +689,12 @@ def __init__( self.num_attention_heads = num_attention_heads self.attention_head_dim = attention_head_dim - # Image processing modules + # Image processing modules. + # Modulation linears are kept full precision — they produce + # shift/scale/gate values that are precision-sensitive (see #2728). self.img_mod = nn.Sequential( nn.SiLU(), - ReplicatedLinear( - dim, - 6 * dim, - bias=True, - return_bias=False, - quant_config=quant_config, - prefix="img_mod.1", - ), + nn.Linear(dim, 6 * dim, bias=True), ) self.img_norm1 = AdaLayerNorm(dim, elementwise_affine=False, eps=eps) self.attn = QwenImageCrossAttention( @@ -725,17 +708,10 @@ def __init__( self.img_norm2 = AdaLayerNorm(dim, elementwise_affine=False, eps=eps) self.img_mlp = FeedForward(dim=dim, dim_out=dim, quant_config=quant_config, prefix="img_mlp") - # Text processing modules + # Text processing modules. 
self.txt_mod = nn.Sequential( nn.SiLU(), - ReplicatedLinear( - dim, - 6 * dim, - bias=True, - return_bias=False, - quant_config=quant_config, - prefix="txt_mod.1", - ), + nn.Linear(dim, 6 * dim, bias=True), ) self.txt_norm1 = AdaLayerNorm(dim, elementwise_affine=False, eps=eps) # Text doesn't need separate attention - it's handled by img_attn joint computation @@ -958,22 +934,10 @@ def __init__( self.txt_norm = RMSNorm(joint_attention_dim, eps=1e-6) - self.img_in = ReplicatedLinear( - in_channels, - self.inner_dim, - bias=True, - return_bias=False, - quant_config=quant_config, - prefix="img_in", - ) - self.txt_in = ReplicatedLinear( - joint_attention_dim, - self.inner_dim, - bias=True, - return_bias=False, - quant_config=quant_config, - prefix="txt_in", - ) + # Entry projections (image/text) are kept full precision — small + # sensitive layers at the network boundary (see #2728). + self.img_in = nn.Linear(in_channels, self.inner_dim, bias=True) + self.txt_in = nn.Linear(joint_attention_dim, self.inner_dim, bias=True) self.transformer_blocks = nn.ModuleList( [ @@ -988,23 +952,12 @@ def __init__( ] ) + # Final modulation and output projection are kept full precision — + # they produce the output latent and are precision-sensitive + # (see #2728). 
self.norm_out = AdaLayerNormContinuous(self.inner_dim, self.inner_dim, elementwise_affine=False, eps=1e-6) - self.norm_out.linear = ReplicatedLinear( - self.inner_dim, - 2 * self.inner_dim, - bias=True, - return_bias=False, - quant_config=quant_config, - prefix="norm_out.linear", - ) - self.proj_out = ReplicatedLinear( - self.inner_dim, - patch_size * patch_size * self.out_channels, - bias=True, - return_bias=False, - quant_config=quant_config, - prefix="proj_out", - ) + self.norm_out.linear = nn.Linear(self.inner_dim, 2 * self.inner_dim, bias=True) + self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True) self.gradient_checkpointing = False self.zero_cond_t = zero_cond_t From 7fccddeda6ca49bb67898192488ce42164ab7438 Mon Sep 17 00:00:00 2001 From: Zhang Date: Tue, 14 Apr 2026 09:10:26 +0000 Subject: [PATCH 3/6] skip layers by quant_config=None Signed-off-by: Zhang --- .../qwen_image/qwen_image_transformer.py | 86 ++++++++++++++++--- .../models/z_image/z_image_transformer.py | 68 +++++++++++---- 2 files changed, 125 insertions(+), 29 deletions(-) diff --git a/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py b/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py index acf2f2aa494..049bdc053ec 100644 --- a/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py +++ b/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py @@ -169,10 +169,25 @@ def __init__( self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0, scale=1000) self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim) - # Time embedding MLP is kept full precision — small layers that - # feed the per-block modulation; precision-sensitive (see issue #2728). 
- self.timestep_embedder.linear_1 = nn.Linear(256, embedding_dim, bias=True) - self.timestep_embedder.linear_2 = nn.Linear(embedding_dim, embedding_dim, bias=True) + # Time embedding MLP is kept full precision (quant_config=None) — + # small layers that feed per-block modulation; precision-sensitive + # (see #2728). + self.timestep_embedder.linear_1 = ReplicatedLinear( + 256, + embedding_dim, + bias=True, + return_bias=False, + quant_config=None, + prefix="timestep_embedder.linear_1", + ) + self.timestep_embedder.linear_2 = ReplicatedLinear( + embedding_dim, + embedding_dim, + bias=True, + return_bias=False, + quant_config=None, + prefix="timestep_embedder.linear_2", + ) self.use_additional_t_cond = use_additional_t_cond if use_additional_t_cond: self.addition_t_embedding = nn.Embedding(2, embedding_dim) @@ -690,11 +705,19 @@ def __init__( self.attention_head_dim = attention_head_dim # Image processing modules. - # Modulation linears are kept full precision — they produce - # shift/scale/gate values that are precision-sensitive (see #2728). + # Modulation linear is kept full precision (quant_config=None) — it + # produces shift/scale/gate values that are precision-sensitive + # (see #2728). self.img_mod = nn.Sequential( nn.SiLU(), - nn.Linear(dim, 6 * dim, bias=True), + ReplicatedLinear( + dim, + 6 * dim, + bias=True, + return_bias=False, + quant_config=None, + prefix="img_mod.1", + ), ) self.img_norm1 = AdaLayerNorm(dim, elementwise_affine=False, eps=eps) self.attn = QwenImageCrossAttention( @@ -711,7 +734,14 @@ def __init__( # Text processing modules. 
self.txt_mod = nn.Sequential( nn.SiLU(), - nn.Linear(dim, 6 * dim, bias=True), + ReplicatedLinear( + dim, + 6 * dim, + bias=True, + return_bias=False, + quant_config=None, + prefix="txt_mod.1", + ), ) self.txt_norm1 = AdaLayerNorm(dim, elementwise_affine=False, eps=eps) # Text doesn't need separate attention - it's handled by img_attn joint computation @@ -934,10 +964,24 @@ def __init__( self.txt_norm = RMSNorm(joint_attention_dim, eps=1e-6) - # Entry projections (image/text) are kept full precision — small - # sensitive layers at the network boundary (see #2728). - self.img_in = nn.Linear(in_channels, self.inner_dim, bias=True) - self.txt_in = nn.Linear(joint_attention_dim, self.inner_dim, bias=True) + # Entry projections (image/text) are kept full precision — + # small sensitive layers at the network boundary (see #2728). + self.img_in = ReplicatedLinear( + in_channels, + self.inner_dim, + bias=True, + return_bias=False, + quant_config=None, + prefix="img_in", + ) + self.txt_in = ReplicatedLinear( + joint_attention_dim, + self.inner_dim, + bias=True, + return_bias=False, + quant_config=None, + prefix="txt_in", + ) self.transformer_blocks = nn.ModuleList( [ @@ -956,8 +1000,22 @@ def __init__( # they produce the output latent and are precision-sensitive # (see #2728). 
self.norm_out = AdaLayerNormContinuous(self.inner_dim, self.inner_dim, elementwise_affine=False, eps=1e-6) - self.norm_out.linear = nn.Linear(self.inner_dim, 2 * self.inner_dim, bias=True) - self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True) + self.norm_out.linear = ReplicatedLinear( + self.inner_dim, + 2 * self.inner_dim, + bias=True, + return_bias=False, + quant_config=None, + prefix="norm_out.linear", + ) + self.proj_out = ReplicatedLinear( + self.inner_dim, + patch_size * patch_size * self.out_channels, + bias=True, + return_bias=False, + quant_config=None, + prefix="proj_out", + ) self.gradient_checkpointing = False self.zero_cond_t = zero_cond_t diff --git a/vllm_omni/diffusion/models/z_image/z_image_transformer.py b/vllm_omni/diffusion/models/z_image/z_image_transformer.py index f4a1586b056..c36ea746654 100644 --- a/vllm_omni/diffusion/models/z_image/z_image_transformer.py +++ b/vllm_omni/diffusion/models/z_image/z_image_transformer.py @@ -214,12 +214,24 @@ def __init__( super().__init__() if mid_size is None: mid_size = out_size - # Time embedding MLP is kept full precision — small layers that - # feed adaLN; precision-sensitive (see issue #2728). + # Time embedding MLP is kept full precision (quant_config=None) — + # small layers that feed adaLN; precision-sensitive (see #2728). 
self.mlp = nn.Sequential( - nn.Linear(frequency_embedding_size, mid_size, bias=True), + ReplicatedLinear( + frequency_embedding_size, + mid_size, + bias=True, + quant_config=None, + return_bias=False, + ), nn.SiLU(), - nn.Linear(mid_size, out_size, bias=True), + ReplicatedLinear( + mid_size, + out_size, + bias=True, + quant_config=None, + return_bias=False, + ), ) self.frequency_embedding_size = frequency_embedding_size @@ -416,11 +428,17 @@ def __init__( self.modulation = modulation if modulation: - # Modulation linear is kept at full precision — it produces - # scale/gate values that are precision-sensitive (mirrors the - # OmniGen2 FP8 fix; see issue #2728). + # Modulation linear is kept at full precision (quant_config=None) + # — it produces scale/gate values that are precision-sensitive + # (see #2728, mirrors OmniGen2 fix). self.adaLN_modulation = nn.Sequential( - nn.Linear(min(dim, ADALN_EMBED_DIM), 4 * dim, bias=True), + ReplicatedLinear( + min(dim, ADALN_EMBED_DIM), + 4 * dim, + bias=True, + quant_config=None, + return_bias=False, + ), ) def forward( @@ -477,12 +495,24 @@ def __init__(self, hidden_size, out_channels, quant_config: "QuantizationConfig super().__init__() self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) # Final output projection and its modulation are precision-sensitive - # (map latents -> image); keep at full precision (see issue #2728). - self.linear = nn.Linear(hidden_size, out_channels, bias=True) + # (produce the output latent); keep at full precision (see #2728). 
+ self.linear = ReplicatedLinear( + hidden_size, + out_channels, + bias=True, + quant_config=None, + return_bias=False, + ) self.adaLN_modulation = nn.Sequential( nn.SiLU(), - nn.Linear(min(hidden_size, ADALN_EMBED_DIM), hidden_size, bias=True), + ReplicatedLinear( + min(hidden_size, ADALN_EMBED_DIM), + hidden_size, + bias=True, + quant_config=None, + return_bias=False, + ), ) def forward(self, x, c): @@ -663,11 +693,13 @@ def __init__( all_final_layer = {} for patch_idx, (patch_size, f_patch_size) in enumerate(zip(all_patch_size, all_f_patch_size)): # x_embedder (patch embed) is a small precision-sensitive entry - # layer; keep full precision (see issue #2728). - x_embedder = nn.Linear( + # layer; keep full precision (see #2728). + x_embedder = ReplicatedLinear( f_patch_size * patch_size * patch_size * in_channels, dim, bias=True, + quant_config=None, + return_bias=False, ) all_x_embedder[f"{patch_size}-{f_patch_size}"] = x_embedder @@ -710,10 +742,16 @@ def __init__( ) self.t_embedder = TimestepEmbedder(min(dim, ADALN_EMBED_DIM), mid_size=1024, quant_config=quant_config) # Caption embedder maps text features -> hidden; keep full precision - # (see issue #2728). + # (see #2728). self.cap_embedder = nn.Sequential( RMSNorm(cap_feat_dim, eps=norm_eps), - nn.Linear(cap_feat_dim, dim, bias=True), + ReplicatedLinear( + cap_feat_dim, + dim, + bias=True, + quant_config=None, + return_bias=False, + ), ) self.x_pad_token = nn.Parameter(torch.empty((1, dim))) From 56cc9a1613aaa25c1e8e45d4bdaf65c137801e53 Mon Sep 17 00:00:00 2001 From: Zhang Date: Tue, 14 Apr 2026 11:06:37 +0000 Subject: [PATCH 4/6] [BugFix][FLUX] Keep dual-stream blocks and modulation unquantized under FP8 (#2728) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit FP8 online quantization on FLUX.1-dev produced pure pixel noise (LPIPS 0.93 vs BF16). 
Unlike Z-Image/Qwen where the modulation/embedder pattern was enough, FLUX's dual-stream blocks (19 FluxTransformerBlock) run joint attention over concatenated [text, image] tokens — the mixed-distribution activations don't tolerate FP8 per-token quant, and neither the attn nor ff sub-layers can individually take FP8. Keep dual blocks fully BF16 and keep per-block modulation and final norm_out unquantized. Single blocks (38 of them, ~2x more param than dual) remain FP8, preserving most of the memory saving. After fix: LPIPS 0.1201 (PASS, FLUX threshold 0.20). Peak 33.2 GB vs BF16 36.7 GB (saves ~3.5 GB; less than Z-Image/Qwen because the bulk of dual-block params stays BF16). Co-Authored-By: pjh4993 Signed-off-by: Zhang --- vllm_omni/diffusion/models/flux/flux_transformer.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/vllm_omni/diffusion/models/flux/flux_transformer.py b/vllm_omni/diffusion/models/flux/flux_transformer.py index 680b8bfbbed..297c6267515 100644 --- a/vllm_omni/diffusion/models/flux/flux_transformer.py +++ b/vllm_omni/diffusion/models/flux/flux_transformer.py @@ -381,7 +381,9 @@ def __init__( super().__init__() self.mlp_hidden_dim = int(dim * mlp_ratio) - self.norm = AdaLayerNormZeroSingle(dim, quant_config=quant_config, prefix=f"{prefix}.norm") + # Modulation linear kept full precision; shift/scale/gate outputs + # are multiplied into the residual stream every block (see #2728). + self.norm = AdaLayerNormZeroSingle(dim, quant_config=None, prefix=f"{prefix}.norm") self.proj_mlp = ReplicatedLinear( dim, self.mlp_hidden_dim, @@ -563,13 +565,16 @@ def __init__( self.context_embedder = nn.Linear(joint_attention_dim, self.inner_dim) self.x_embedder = nn.Linear(in_channels, self.inner_dim) + # Dual-stream blocks kept full precision — FP8 on their joint + # attention path causes noise on FLUX (#2728). Single-stream + # blocks (38 vs 19) still get FP8 for memory savings. 
self.transformer_blocks = nn.ModuleList( [ FluxTransformerBlock( dim=self.inner_dim, num_attention_heads=num_attention_heads, attention_head_dim=attention_head_dim, - quant_config=quant_config, + quant_config=None, prefix=f"transformer_blocks.{i}", ) for i in range(num_layers) @@ -589,12 +594,13 @@ def __init__( ] ) + # Final modulation feeds proj_out; keep full precision (see #2728). self.norm_out = AdaLayerNormContinuous( self.inner_dim, self.inner_dim, elementwise_affine=False, eps=1e-6, - quant_config=quant_config, + quant_config=None, prefix="norm_out", ) self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True) From 5e695bde2ecc78a0e054fb054ce73596fda2eefc Mon Sep 17 00:00:00 2001 From: Zhang Date: Tue, 14 Apr 2026 12:15:13 +0000 Subject: [PATCH 5/6] docs: align docs/user_guide with main (drop FP8 table edits) Signed-off-by: Zhang --- docs/user_guide/diffusion/quantization/fp8.md | 16 +++++++--------- .../diffusion/quantization/overview.md | 2 +- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/docs/user_guide/diffusion/quantization/fp8.md b/docs/user_guide/diffusion/quantization/fp8.md index a0d5545b36b..9906631b625 100644 --- a/docs/user_guide/diffusion/quantization/fp8.md +++ b/docs/user_guide/diffusion/quantization/fp8.md @@ -4,11 +4,9 @@ FP8 quantization converts BF16/FP16 weights to FP8 at model load time. No calibration or pre-quantized checkpoint needed. -Depending on the model, either most linear layers are quantized with a few **built-in** BF16 exceptions in code (`quant_config=None`), or you must add extra skips via `ignored_layers`. See the [per-model table](#supported-models). +Depending on the model, either all layers can be quantized, or some sensitive layers should stay in BF16. See the [per-model table](#supported-models) for which case applies. 
-Built-in BF16 is used where small linear layers drive **timestep conditioning**, **per-block modulation** (scale/shift/gate), **input embedders**, or the **final latent projection**. Quantizing those paths caused visible noise or color drift on Z-Image, Qwen-Image, and FLUX.1-dev ([PR #2728](https://github.com/vllm-project/vllm-omni/pull/2728)); they stay in full precision automatically—you do not name them in `ignored_layers`. - -Beyond that, common user-controlled skips include **image-stream MLPs** (`img_mlp`) on Qwen-Image: they see shifting latent statistics and benefit from `ignored_layers` for best quality. **Attention projections** (`to_qkv`, `to_out`) and **text-stream MLPs** (`txt_mlp`) are usually fine in FP8 when modulation and embedders stay BF16. +Common sensitive layers in DiT-based diffusion models include **image-stream MLPs** (`img_mlp`). These are particularly vulnerable to FP8 precision loss because they process denoising latents whose dynamic range shifts significantly across timesteps, and unlike attention projections (which benefit from QK-Norm stabilization), MLPs have no built-in normalization to absorb quantization error. In deep architectures (e.g., 60+ residual blocks), small per-layer errors compound and degrade output quality. Other layers such as **attention projections** (`to_qkv`, `to_out`) and **text-stream MLPs** (`txt_mlp`) are generally more robust due to normalization or more stable input statistics. ## Configuration @@ -60,11 +58,11 @@ The available `ignored_layers` names depend on the model architecture (e.g., `to ## Supported Models -| Model | HF Models | FP8 scope | `ignored_layers` (optional) | -|-------|-----------|-----------|------------------------------| -| Z-Image | `Tongyi-MAI/Z-Image-Turbo` | Main blocks (attention + FFN) use FP8. **Always BF16 in code:** timestep MLP, per-block adaLN modulation linear, patch and caption embedders, final layer (modulation + `proj_out`). 
| None required for those paths | -| Qwen-Image | `Qwen/Qwen-Image`, `Qwen/Qwen-Image-2512` | Joint attention and MLPs can use FP8. **Always BF16 in code:** timestep MLP, per-block `img_mod` / `txt_mod` linears, `img_in` / `txt_in`, `norm_out.linear`, `proj_out`. | Still recommend `img_mlp` for quality | -| Flux | `black-forest-labs/FLUX.1-dev` | **Single-stream** blocks (`single_transformer_blocks`) use FP8. **Always BF16 in code:** all **dual-stream** blocks (`transformer_blocks`, joint attention path), AdaLayerNormZeroSingle modulation in single blocks, and `norm_out` before final `proj_out`. | None required for those paths | +| Model | HF Models | Recommendation | `ignored_layers` | +|-------|-----------|---------------|------------------| +| Z-Image | `Tongyi-MAI/Z-Image-Turbo` | All layers | None | +| Qwen-Image | `Qwen/Qwen-Image`, `Qwen/Qwen-Image-2512` | Skip sensitive layers | `img_mlp` | +| Flux | `black-forest-labs/FLUX.1-dev` | All layers | None | | HunyuanImage-3 | `tencent/HunyuanImage3` | All layers | None | | HunyuanVideo-1.5 | `hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-480p_t2v`, `720p_t2v`, `480p_i2v` | All layers | None | | Helios | `BestWishYsh/Helios-Base`, `BestWishYsh/Helios-Mid`, `BestWishYsh/Helios-Distilled` | All layers | None | diff --git a/docs/user_guide/diffusion/quantization/overview.md b/docs/user_guide/diffusion/quantization/overview.md index 4d1b37e73d4..25d7fa5c756 100644 --- a/docs/user_guide/diffusion/quantization/overview.md +++ b/docs/user_guide/diffusion/quantization/overview.md @@ -53,7 +53,7 @@ When `--quantization fp8` is enabled for diffusion models: | Component | What Gets Quantized | Mechanism | |-----------|-------------------|-----------| -| **DiT (transformer)** | `nn.Linear` layers (minus [built-in BF16 submodules](fp8.md#supported-models) on Z-Image, Qwen-Image, FLUX.1-dev) | vLLM W8A8 FP8 compute (Ada/Hopper) or weight-only (older GPUs) | +| **DiT (transformer)** | `nn.Linear` layers | vLLM W8A8 FP8 
compute (Ada/Hopper) or weight-only (older GPUs) | | **Text encoder** | `nn.Linear` layers | FP8 weight storage, BF16 compute | | **VAE** | `nn.Conv2d`, `nn.Conv3d` layers | FP8 weight storage, BF16 compute | From 47ded5dc16e17fe73aac7e65d4bde122b14b7c82 Mon Sep 17 00:00:00 2001 From: Zhang Date: Tue, 14 Apr 2026 12:15:13 +0000 Subject: [PATCH 6/6] docs: align quantization contributor guide with main Signed-off-by: Zhang --- docs/contributing/model/adding_quantization_model.md | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/docs/contributing/model/adding_quantization_model.md b/docs/contributing/model/adding_quantization_model.md index 38e07387b3c..f2731888846 100644 --- a/docs/contributing/model/adding_quantization_model.md +++ b/docs/contributing/model/adding_quantization_model.md @@ -156,7 +156,7 @@ Some layers degrade output quality when quantized. Common sensitive layers: |-------|--------|-----------|----------------------|----------------| | Qwen-Image-2512 | Int8 | LPIPS 0.0197 | 0.0027 (skip `img_mlp`) | Skip `img_mlp` | | Z-Image-Turbo | Int8 | LPIPS 0.1597 | 0.0290 (skip `feed_forward`) | Skip `feed_forward` | -| Z-Image-Turbo | FP8 | LPIPS ~0.005 | — | Modulation / embedders / final layer stay BF16 in code ([#2728](https://github.com/vllm-project/vllm-omni/pull/2728)) | +| Z-Image-Turbo | FP8 | LPIPS ~0.005 | — | All layers OK | To identify sensitive layers for a new model: @@ -400,7 +400,7 @@ aggressive quantization methods (Int8, NVFP4). 
|-------|--------|-------------------|----------------------|---------------------| | Qwen-Image-2512 | Int8 | 0.0197 | 0.0027 | `img_mlp` | | Z-Image-Turbo | Int8 | 0.1597 | 0.0290 | `feed_forward` | -| Z-Image-Turbo | FP8 | ~0.005 | — | Built-in BF16 for modulation, embedders, final layer ([#2728](https://github.com/vllm-project/vllm-omni/pull/2728)) | +| Z-Image-Turbo | FP8 | ~0.005 | — | None needed | ### What to Verify (Checklist) @@ -459,9 +459,8 @@ aggressive quantization methods (Int8, NVFP4). | Model | Pipeline File | Transformer File | Notes | |-------|-------------|-----------------|-------| -| **Z-Image** | `models/z_image/pipeline_z_image.py` | `models/z_image/z_image_transformer.py` | DiT blocks + text encoder FP8; modulation, embedders, final layer BF16 ([#2728](https://github.com/vllm-project/vllm-omni/pull/2728)) | -| **Qwen-Image** | `models/qwen_image/pipeline_qwen_image.py` | `models/qwen_image/qwen_image_transformer.py` | DiT blocks + text encoder + VAE FP8; timestep MLP, modulation linears, `img_in`/`txt_in`, `norm_out`/`proj_out` BF16 ([#2728](https://github.com/vllm-project/vllm-omni/pull/2728)) | -| **FLUX.1-dev** | `models/flux/pipeline_flux.py` | `models/flux/flux_transformer.py` | Single-stream blocks FP8; dual-stream blocks + final `norm_out` BF16 ([#2728](https://github.com/vllm-project/vllm-omni/pull/2728)) | +| **Z-Image** | `models/z_image/pipeline_z_image.py` | `models/z_image/z_image_transformer.py` | DiT + text encoder FP8 | +| **Qwen-Image** | `models/qwen_image/pipeline_qwen_image.py` | `models/qwen_image/qwen_image_transformer.py` | DiT + text encoder + VAE FP8 | All files are under `vllm_omni/diffusion/`.