From 6be8de51486f9ee3016cc3ea5f0b997e34a37d8f Mon Sep 17 00:00:00 2001
From: Zhang <jianmusings@gmail.com>
Date: Tue, 14 Apr 2026 08:37:42 +0000
Subject: [PATCH 1/6] [BugFix][Z-Image] Keep modulation and embedder layers
 unquantized under FP8   (#2728)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

  FP8 online quantization on Z-Image-Turbo produced pure pixel noise
  (LPIPS 0.74 vs BF16). Root cause: small precision-sensitive layers —
  TimestepEmbedder MLP, x_embedder, cap_embedder, per-block
  adaLN_modulation, and FinalLayer's output/modulation — were being
  FP8-quantized. Errors on these layers feed the adaLN scale/gate
  chain that multiplies the residual stream every block, so small
  per-layer drift turns into catastrophic magnitude blow-up by layer 30.

  Mirrors the earlier OmniGen2 FP8 fix (dbf8b7c7). Swap these 6 layers
  from ReplicatedLinear(..., quant_config=quant_config) to plain
  nn.Linear. Main-path matmuls (to_qkv, to_out, feed_forward.w13,
  feed_forward.w2) stay FP8, so the memory win is preserved.

  After fix: LPIPS 0.0659 (PASS, threshold 0.1).

Signed-off-by: Zhang <jianmusings@gmail.com>
---
 .../models/z_image/z_image_transformer.py     | 45 ++++++++-----------
 1 file changed, 18 insertions(+), 27 deletions(-)

diff --git a/vllm_omni/diffusion/models/z_image/z_image_transformer.py b/vllm_omni/diffusion/models/z_image/z_image_transformer.py
index 3ffad221ba9..f4a1586b056 100644
--- a/vllm_omni/diffusion/models/z_image/z_image_transformer.py
+++ b/vllm_omni/diffusion/models/z_image/z_image_transformer.py
@@ -214,22 +214,12 @@ def __init__(
         super().__init__()
         if mid_size is None:
             mid_size = out_size
+        # Time embedding MLP is kept full precision — small layers that
+        # feed adaLN; precision-sensitive (see issue #2728).
         self.mlp = nn.Sequential(
-            ReplicatedLinear(
-                frequency_embedding_size,
-                mid_size,
-                bias=True,
-                quant_config=quant_config,
-                return_bias=False,
-            ),
+            nn.Linear(frequency_embedding_size, mid_size, bias=True),
             nn.SiLU(),
-            ReplicatedLinear(
-                mid_size,
-                out_size,
-                bias=True,
-                quant_config=quant_config,
-                return_bias=False,
-            ),
+            nn.Linear(mid_size, out_size, bias=True),
         )
 
         self.frequency_embedding_size = frequency_embedding_size
@@ -426,10 +416,11 @@ def __init__(
 
         self.modulation = modulation
         if modulation:
+            # Modulation linear is kept at full precision — it produces
+            # scale/gate values that are precision-sensitive (mirrors the
+            # OmniGen2 FP8 fix; see issue #2728).
             self.adaLN_modulation = nn.Sequential(
-                ReplicatedLinear(
-                    min(dim, ADALN_EMBED_DIM), 4 * dim, bias=True, return_bias=False, quant_config=quant_config
-                ),
+                nn.Linear(min(dim, ADALN_EMBED_DIM), 4 * dim, bias=True),
             )
 
     def forward(
@@ -485,15 +476,13 @@ class FinalLayer(nn.Module):
     def __init__(self, hidden_size, out_channels, quant_config: "QuantizationConfig | None" = None):
         super().__init__()
         self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
-        self.linear = ReplicatedLinear(
-            hidden_size, out_channels, bias=True, quant_config=quant_config, return_bias=False
-        )
+        # Final output projection and its modulation are precision-sensitive
+        # (map latents -> image); keep at full precision (see issue #2728).
+        self.linear = nn.Linear(hidden_size, out_channels, bias=True)
 
         self.adaLN_modulation = nn.Sequential(
             nn.SiLU(),
-            ReplicatedLinear(
-                min(hidden_size, ADALN_EMBED_DIM), hidden_size, bias=True, quant_config=quant_config, return_bias=False
-            ),
+            nn.Linear(min(hidden_size, ADALN_EMBED_DIM), hidden_size, bias=True),
         )
 
     def forward(self, x, c):
@@ -673,12 +662,12 @@ def __init__(
         all_x_embedder = {}
         all_final_layer = {}
         for patch_idx, (patch_size, f_patch_size) in enumerate(zip(all_patch_size, all_f_patch_size)):
-            x_embedder = ReplicatedLinear(
+            # x_embedder (patch embed) is a small precision-sensitive entry
+            # layer; keep full precision (see issue #2728).
+            x_embedder = nn.Linear(
                 f_patch_size * patch_size * patch_size * in_channels,
                 dim,
                 bias=True,
-                quant_config=quant_config,
-                return_bias=False,
             )
             all_x_embedder[f"{patch_size}-{f_patch_size}"] = x_embedder
 
@@ -720,9 +709,11 @@ def __init__(
             ]
         )
         self.t_embedder = TimestepEmbedder(min(dim, ADALN_EMBED_DIM), mid_size=1024, quant_config=quant_config)
+        # Caption embedder maps text features -> hidden; keep full precision
+        # (see issue #2728).
         self.cap_embedder = nn.Sequential(
             RMSNorm(cap_feat_dim, eps=norm_eps),
-            ReplicatedLinear(cap_feat_dim, dim, bias=True, return_bias=False, quant_config=quant_config),
+            nn.Linear(cap_feat_dim, dim, bias=True),
         )
 
         self.x_pad_token = nn.Parameter(torch.empty((1, dim)))

From 70af357ee79880a81495f713ca775324b1b52e7b Mon Sep 17 00:00:00 2001
From: Zhang <jianmusings@gmail.com>
Date: Tue, 14 Apr 2026 09:02:56 +0000
Subject: [PATCH 2/6] [BugFix][Qwen-Image] Keep modulation and embedder layers
 unquantized under FP8   (#2728)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

  FP8 online quantization on Qwen-Image produced pure pixel noise
  (LPIPS 0.95 vs BF16). Same root cause as the Z-Image fix: precision-
  sensitive small layers (time embedder, img_in/txt_in entry, per-block
  img_mod/txt_mod modulation, norm_out.linear, proj_out) feed the
  shift/scale/gate chain that multiplies the residual stream every
  block, so small per-layer drift blows up into noise.

  After fix: LPIPS 0.32 (PASS, Qwen-Image threshold 0.35). Main-path
  matmuls (to_qkv, to_out, add_kv_proj, to_add_out, img_mlp, txt_mlp)
  remain FP8 for memory savings — peak ~41 GB vs ~59 GB BF16.

Signed-off-by: Zhang <jianmusings@gmail.com>
---
 .../qwen_image/qwen_image_transformer.py      | 85 +++++--------------
 1 file changed, 19 insertions(+), 66 deletions(-)

diff --git a/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py b/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py
index b34f19e954a..acf2f2aa494 100644
--- a/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py
+++ b/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py
@@ -169,22 +169,10 @@ def __init__(
 
         self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0, scale=1000)
         self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
-        self.timestep_embedder.linear_1 = ReplicatedLinear(
-            256,
-            embedding_dim,
-            bias=True,
-            return_bias=False,
-            quant_config=quant_config,
-            prefix="timestep_embedder.linear_1",
-        )
-        self.timestep_embedder.linear_2 = ReplicatedLinear(
-            embedding_dim,
-            embedding_dim,
-            bias=True,
-            return_bias=False,
-            quant_config=quant_config,
-            prefix="timestep_embedder.linear_2",
-        )
+        # Time embedding MLP is kept full precision — small layers that
+        # feed the per-block modulation; precision-sensitive (see issue #2728).
+        self.timestep_embedder.linear_1 = nn.Linear(256, embedding_dim, bias=True)
+        self.timestep_embedder.linear_2 = nn.Linear(embedding_dim, embedding_dim, bias=True)
         self.use_additional_t_cond = use_additional_t_cond
         if use_additional_t_cond:
             self.addition_t_embedding = nn.Embedding(2, embedding_dim)
@@ -701,17 +689,12 @@ def __init__(
         self.num_attention_heads = num_attention_heads
         self.attention_head_dim = attention_head_dim
 
-        # Image processing modules
+        # Image processing modules.
+        # Modulation linears are kept full precision — they produce
+        # shift/scale/gate values that are precision-sensitive (see #2728).
         self.img_mod = nn.Sequential(
             nn.SiLU(),
-            ReplicatedLinear(
-                dim,
-                6 * dim,
-                bias=True,
-                return_bias=False,
-                quant_config=quant_config,
-                prefix="img_mod.1",
-            ),
+            nn.Linear(dim, 6 * dim, bias=True),
         )
         self.img_norm1 = AdaLayerNorm(dim, elementwise_affine=False, eps=eps)
         self.attn = QwenImageCrossAttention(
@@ -725,17 +708,10 @@ def __init__(
         self.img_norm2 = AdaLayerNorm(dim, elementwise_affine=False, eps=eps)
         self.img_mlp = FeedForward(dim=dim, dim_out=dim, quant_config=quant_config, prefix="img_mlp")
 
-        # Text processing modules
+        # Text processing modules.
         self.txt_mod = nn.Sequential(
             nn.SiLU(),
-            ReplicatedLinear(
-                dim,
-                6 * dim,
-                bias=True,
-                return_bias=False,
-                quant_config=quant_config,
-                prefix="txt_mod.1",
-            ),
+            nn.Linear(dim, 6 * dim, bias=True),
         )
         self.txt_norm1 = AdaLayerNorm(dim, elementwise_affine=False, eps=eps)
         # Text doesn't need separate attention - it's handled by img_attn joint computation
@@ -958,22 +934,10 @@ def __init__(
 
         self.txt_norm = RMSNorm(joint_attention_dim, eps=1e-6)
 
-        self.img_in = ReplicatedLinear(
-            in_channels,
-            self.inner_dim,
-            bias=True,
-            return_bias=False,
-            quant_config=quant_config,
-            prefix="img_in",
-        )
-        self.txt_in = ReplicatedLinear(
-            joint_attention_dim,
-            self.inner_dim,
-            bias=True,
-            return_bias=False,
-            quant_config=quant_config,
-            prefix="txt_in",
-        )
+        # Entry projections (image/text) are kept full precision — small
+        # sensitive layers at the network boundary (see #2728).
+        self.img_in = nn.Linear(in_channels, self.inner_dim, bias=True)
+        self.txt_in = nn.Linear(joint_attention_dim, self.inner_dim, bias=True)
 
         self.transformer_blocks = nn.ModuleList(
             [
@@ -988,23 +952,12 @@ def __init__(
             ]
         )
 
+        # Final modulation and output projection are kept full precision —
+        # they produce the output latent and are precision-sensitive
+        # (see #2728).
         self.norm_out = AdaLayerNormContinuous(self.inner_dim, self.inner_dim, elementwise_affine=False, eps=1e-6)
-        self.norm_out.linear = ReplicatedLinear(
-            self.inner_dim,
-            2 * self.inner_dim,
-            bias=True,
-            return_bias=False,
-            quant_config=quant_config,
-            prefix="norm_out.linear",
-        )
-        self.proj_out = ReplicatedLinear(
-            self.inner_dim,
-            patch_size * patch_size * self.out_channels,
-            bias=True,
-            return_bias=False,
-            quant_config=quant_config,
-            prefix="proj_out",
-        )
+        self.norm_out.linear = nn.Linear(self.inner_dim, 2 * self.inner_dim, bias=True)
+        self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True)
 
         self.gradient_checkpointing = False
         self.zero_cond_t = zero_cond_t

From 7fccddeda6ca49bb67898192488ce42164ab7438 Mon Sep 17 00:00:00 2001
From: Zhang <jianmusings@gmail.com>
Date: Tue, 14 Apr 2026 09:10:26 +0000
Subject: [PATCH 3/6] skip layers by quant_config=None

Signed-off-by: Zhang <jianmusings@gmail.com>
---
 .../qwen_image/qwen_image_transformer.py      | 86 ++++++++++++++++---
 .../models/z_image/z_image_transformer.py     | 68 +++++++++++----
 2 files changed, 125 insertions(+), 29 deletions(-)

diff --git a/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py b/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py
index acf2f2aa494..049bdc053ec 100644
--- a/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py
+++ b/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py
@@ -169,10 +169,25 @@ def __init__(
 
         self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0, scale=1000)
         self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
-        # Time embedding MLP is kept full precision — small layers that
-        # feed the per-block modulation; precision-sensitive (see issue #2728).
-        self.timestep_embedder.linear_1 = nn.Linear(256, embedding_dim, bias=True)
-        self.timestep_embedder.linear_2 = nn.Linear(embedding_dim, embedding_dim, bias=True)
+        # Time embedding MLP is kept full precision (quant_config=None) —
+        # small layers that feed per-block modulation; precision-sensitive
+        # (see #2728).
+        self.timestep_embedder.linear_1 = ReplicatedLinear(
+            256,
+            embedding_dim,
+            bias=True,
+            return_bias=False,
+            quant_config=None,
+            prefix="timestep_embedder.linear_1",
+        )
+        self.timestep_embedder.linear_2 = ReplicatedLinear(
+            embedding_dim,
+            embedding_dim,
+            bias=True,
+            return_bias=False,
+            quant_config=None,
+            prefix="timestep_embedder.linear_2",
+        )
         self.use_additional_t_cond = use_additional_t_cond
         if use_additional_t_cond:
             self.addition_t_embedding = nn.Embedding(2, embedding_dim)
@@ -690,11 +705,19 @@ def __init__(
         self.attention_head_dim = attention_head_dim
 
         # Image processing modules.
-        # Modulation linears are kept full precision — they produce
-        # shift/scale/gate values that are precision-sensitive (see #2728).
+        # Modulation linear is kept full precision (quant_config=None) — it
+        # produces shift/scale/gate values that are precision-sensitive
+        # (see #2728).
         self.img_mod = nn.Sequential(
             nn.SiLU(),
-            nn.Linear(dim, 6 * dim, bias=True),
+            ReplicatedLinear(
+                dim,
+                6 * dim,
+                bias=True,
+                return_bias=False,
+                quant_config=None,
+                prefix="img_mod.1",
+            ),
         )
         self.img_norm1 = AdaLayerNorm(dim, elementwise_affine=False, eps=eps)
         self.attn = QwenImageCrossAttention(
@@ -711,7 +734,14 @@ def __init__(
         # Text processing modules.
         self.txt_mod = nn.Sequential(
             nn.SiLU(),
-            nn.Linear(dim, 6 * dim, bias=True),
+            ReplicatedLinear(
+                dim,
+                6 * dim,
+                bias=True,
+                return_bias=False,
+                quant_config=None,
+                prefix="txt_mod.1",
+            ),
         )
         self.txt_norm1 = AdaLayerNorm(dim, elementwise_affine=False, eps=eps)
         # Text doesn't need separate attention - it's handled by img_attn joint computation
@@ -934,10 +964,24 @@ def __init__(
 
         self.txt_norm = RMSNorm(joint_attention_dim, eps=1e-6)
 
-        # Entry projections (image/text) are kept full precision — small
-        # sensitive layers at the network boundary (see #2728).
-        self.img_in = nn.Linear(in_channels, self.inner_dim, bias=True)
-        self.txt_in = nn.Linear(joint_attention_dim, self.inner_dim, bias=True)
+        # Entry projections (image/text) are kept full precision —
+        # small sensitive layers at the network boundary (see #2728).
+        self.img_in = ReplicatedLinear(
+            in_channels,
+            self.inner_dim,
+            bias=True,
+            return_bias=False,
+            quant_config=None,
+            prefix="img_in",
+        )
+        self.txt_in = ReplicatedLinear(
+            joint_attention_dim,
+            self.inner_dim,
+            bias=True,
+            return_bias=False,
+            quant_config=None,
+            prefix="txt_in",
+        )
 
         self.transformer_blocks = nn.ModuleList(
             [
@@ -956,8 +1000,22 @@ def __init__(
         # they produce the output latent and are precision-sensitive
         # (see #2728).
         self.norm_out = AdaLayerNormContinuous(self.inner_dim, self.inner_dim, elementwise_affine=False, eps=1e-6)
-        self.norm_out.linear = nn.Linear(self.inner_dim, 2 * self.inner_dim, bias=True)
-        self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True)
+        self.norm_out.linear = ReplicatedLinear(
+            self.inner_dim,
+            2 * self.inner_dim,
+            bias=True,
+            return_bias=False,
+            quant_config=None,
+            prefix="norm_out.linear",
+        )
+        self.proj_out = ReplicatedLinear(
+            self.inner_dim,
+            patch_size * patch_size * self.out_channels,
+            bias=True,
+            return_bias=False,
+            quant_config=None,
+            prefix="proj_out",
+        )
 
         self.gradient_checkpointing = False
         self.zero_cond_t = zero_cond_t
diff --git a/vllm_omni/diffusion/models/z_image/z_image_transformer.py b/vllm_omni/diffusion/models/z_image/z_image_transformer.py
index f4a1586b056..c36ea746654 100644
--- a/vllm_omni/diffusion/models/z_image/z_image_transformer.py
+++ b/vllm_omni/diffusion/models/z_image/z_image_transformer.py
@@ -214,12 +214,24 @@ def __init__(
         super().__init__()
         if mid_size is None:
             mid_size = out_size
-        # Time embedding MLP is kept full precision — small layers that
-        # feed adaLN; precision-sensitive (see issue #2728).
+        # Time embedding MLP is kept full precision (quant_config=None) —
+        # small layers that feed adaLN; precision-sensitive (see #2728).
         self.mlp = nn.Sequential(
-            nn.Linear(frequency_embedding_size, mid_size, bias=True),
+            ReplicatedLinear(
+                frequency_embedding_size,
+                mid_size,
+                bias=True,
+                quant_config=None,
+                return_bias=False,
+            ),
             nn.SiLU(),
-            nn.Linear(mid_size, out_size, bias=True),
+            ReplicatedLinear(
+                mid_size,
+                out_size,
+                bias=True,
+                quant_config=None,
+                return_bias=False,
+            ),
         )
 
         self.frequency_embedding_size = frequency_embedding_size
@@ -416,11 +428,17 @@ def __init__(
 
         self.modulation = modulation
         if modulation:
-            # Modulation linear is kept at full precision — it produces
-            # scale/gate values that are precision-sensitive (mirrors the
-            # OmniGen2 FP8 fix; see issue #2728).
+            # Modulation linear is kept at full precision (quant_config=None)
+            # — it produces scale/gate values that are precision-sensitive
+            # (see #2728, mirrors OmniGen2 fix).
             self.adaLN_modulation = nn.Sequential(
-                nn.Linear(min(dim, ADALN_EMBED_DIM), 4 * dim, bias=True),
+                ReplicatedLinear(
+                    min(dim, ADALN_EMBED_DIM),
+                    4 * dim,
+                    bias=True,
+                    quant_config=None,
+                    return_bias=False,
+                ),
             )
 
     def forward(
@@ -477,12 +495,24 @@ def __init__(self, hidden_size, out_channels, quant_config: "QuantizationConfig
         super().__init__()
         self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
         # Final output projection and its modulation are precision-sensitive
-        # (map latents -> image); keep at full precision (see issue #2728).
-        self.linear = nn.Linear(hidden_size, out_channels, bias=True)
+        # (produce the output latent); keep at full precision (see #2728).
+        self.linear = ReplicatedLinear(
+            hidden_size,
+            out_channels,
+            bias=True,
+            quant_config=None,
+            return_bias=False,
+        )
 
         self.adaLN_modulation = nn.Sequential(
             nn.SiLU(),
-            nn.Linear(min(hidden_size, ADALN_EMBED_DIM), hidden_size, bias=True),
+            ReplicatedLinear(
+                min(hidden_size, ADALN_EMBED_DIM),
+                hidden_size,
+                bias=True,
+                quant_config=None,
+                return_bias=False,
+            ),
         )
 
     def forward(self, x, c):
@@ -663,11 +693,13 @@ def __init__(
         all_final_layer = {}
         for patch_idx, (patch_size, f_patch_size) in enumerate(zip(all_patch_size, all_f_patch_size)):
             # x_embedder (patch embed) is a small precision-sensitive entry
-            # layer; keep full precision (see issue #2728).
-            x_embedder = nn.Linear(
+            # layer; keep full precision (see #2728).
+            x_embedder = ReplicatedLinear(
                 f_patch_size * patch_size * patch_size * in_channels,
                 dim,
                 bias=True,
+                quant_config=None,
+                return_bias=False,
             )
             all_x_embedder[f"{patch_size}-{f_patch_size}"] = x_embedder
 
@@ -710,10 +742,16 @@ def __init__(
         )
         self.t_embedder = TimestepEmbedder(min(dim, ADALN_EMBED_DIM), mid_size=1024, quant_config=quant_config)
         # Caption embedder maps text features -> hidden; keep full precision
-        # (see issue #2728).
+        # (see #2728).
         self.cap_embedder = nn.Sequential(
             RMSNorm(cap_feat_dim, eps=norm_eps),
-            nn.Linear(cap_feat_dim, dim, bias=True),
+            ReplicatedLinear(
+                cap_feat_dim,
+                dim,
+                bias=True,
+                quant_config=None,
+                return_bias=False,
+            ),
         )
 
         self.x_pad_token = nn.Parameter(torch.empty((1, dim)))

From 56cc9a1613aaa25c1e8e45d4bdaf65c137801e53 Mon Sep 17 00:00:00 2001
From: Zhang <jianmusings@gmail.com>
Date: Tue, 14 Apr 2026 11:06:37 +0000
Subject: [PATCH 4/6] [BugFix][FLUX] Keep dual-stream blocks and modulation
 unquantized under FP8   (#2728)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

  FP8 online quantization on FLUX.1-dev produced pure pixel noise (LPIPS
  0.93 vs BF16). Unlike Z-Image/Qwen where the modulation/embedder
  pattern was enough, FLUX's dual-stream blocks (19 FluxTransformerBlock)
  run joint attention over concatenated [text, image] tokens — the
  mixed-distribution activations don't tolerate FP8 per-token quant,
  and neither the attn nor ff sub-layers can individually take FP8.

  Keep dual blocks fully BF16 and keep per-block modulation and final
  norm_out unquantized. Single blocks (38 of them, ~2x more param than
  dual) remain FP8, preserving most of the memory saving.

  After fix: LPIPS 0.1201 (PASS, FLUX threshold 0.20). Peak 33.2 GB vs
  BF16 36.7 GB (saves ~3.5 GB; less than Z-Image/Qwen because the bulk
  of dual-block params stays BF16).

Co-Authored-By: pjh4993 <pjh4993@naver.com>
Signed-off-by: Zhang <jianmusings@gmail.com>
---
 vllm_omni/diffusion/models/flux/flux_transformer.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/vllm_omni/diffusion/models/flux/flux_transformer.py b/vllm_omni/diffusion/models/flux/flux_transformer.py
index 680b8bfbbed..297c6267515 100644
--- a/vllm_omni/diffusion/models/flux/flux_transformer.py
+++ b/vllm_omni/diffusion/models/flux/flux_transformer.py
@@ -381,7 +381,9 @@ def __init__(
         super().__init__()
         self.mlp_hidden_dim = int(dim * mlp_ratio)
 
-        self.norm = AdaLayerNormZeroSingle(dim, quant_config=quant_config, prefix=f"{prefix}.norm")
+        # Modulation linear kept full precision; shift/scale/gate outputs
+        # are multiplied into the residual stream every block (see #2728).
+        self.norm = AdaLayerNormZeroSingle(dim, quant_config=None, prefix=f"{prefix}.norm")
         self.proj_mlp = ReplicatedLinear(
             dim,
             self.mlp_hidden_dim,
@@ -563,13 +565,16 @@ def __init__(
         self.context_embedder = nn.Linear(joint_attention_dim, self.inner_dim)
         self.x_embedder = nn.Linear(in_channels, self.inner_dim)
 
+        # Dual-stream blocks kept full precision — FP8 on their joint
+        # attention path causes noise on FLUX (#2728). Single-stream
+        # blocks (38 vs 19) still get FP8 for memory savings.
         self.transformer_blocks = nn.ModuleList(
             [
                 FluxTransformerBlock(
                     dim=self.inner_dim,
                     num_attention_heads=num_attention_heads,
                     attention_head_dim=attention_head_dim,
-                    quant_config=quant_config,
+                    quant_config=None,
                     prefix=f"transformer_blocks.{i}",
                 )
                 for i in range(num_layers)
@@ -589,12 +594,13 @@ def __init__(
             ]
         )
 
+        # Final modulation feeds proj_out; keep full precision (see #2728).
         self.norm_out = AdaLayerNormContinuous(
             self.inner_dim,
             self.inner_dim,
             elementwise_affine=False,
             eps=1e-6,
-            quant_config=quant_config,
+            quant_config=None,
             prefix="norm_out",
         )
         self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True)

From 5e695bde2ecc78a0e054fb054ce73596fda2eefc Mon Sep 17 00:00:00 2001
From: Zhang <jianmusings@gmail.com>
Date: Tue, 14 Apr 2026 12:15:13 +0000
Subject: [PATCH 5/6] docs: align docs/user_guide with main (drop FP8 table
 edits)

Signed-off-by: Zhang <jianmusings@gmail.com>
---
 docs/user_guide/diffusion/quantization/fp8.md    | 16 +++++++---------
 .../diffusion/quantization/overview.md           |  2 +-
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/docs/user_guide/diffusion/quantization/fp8.md b/docs/user_guide/diffusion/quantization/fp8.md
index a0d5545b36b..9906631b625 100644
--- a/docs/user_guide/diffusion/quantization/fp8.md
+++ b/docs/user_guide/diffusion/quantization/fp8.md
@@ -4,11 +4,9 @@
 
 FP8 quantization converts BF16/FP16 weights to FP8 at model load time. No calibration or pre-quantized checkpoint needed.
 
-Depending on the model, either most linear layers are quantized with a few **built-in** BF16 exceptions in code (`quant_config=None`), or you must add extra skips via `ignored_layers`. See the [per-model table](#supported-models).
+Depending on the model, either all layers can be quantized, or some sensitive layers should stay in BF16. See the [per-model table](#supported-models) for which case applies.
 
-Built-in BF16 is used where small linear layers drive **timestep conditioning**, **per-block modulation** (scale/shift/gate), **input embedders**, or the **final latent projection**. Quantizing those paths caused visible noise or color drift on Z-Image, Qwen-Image, and FLUX.1-dev ([PR #2728](https://github.com/vllm-project/vllm-omni/pull/2728)); they stay in full precision automatically—you do not name them in `ignored_layers`.
-
-Beyond that, common user-controlled skips include **image-stream MLPs** (`img_mlp`) on Qwen-Image: they see shifting latent statistics and benefit from `ignored_layers` for best quality. **Attention projections** (`to_qkv`, `to_out`) and **text-stream MLPs** (`txt_mlp`) are usually fine in FP8 when modulation and embedders stay BF16.
+Common sensitive layers in DiT-based diffusion models include **image-stream MLPs** (`img_mlp`). These are particularly vulnerable to FP8 precision loss because they process denoising latents whose dynamic range shifts significantly across timesteps, and unlike attention projections (which benefit from QK-Norm stabilization), MLPs have no built-in normalization to absorb quantization error. In deep architectures (e.g., 60+ residual blocks), small per-layer errors compound and degrade output quality. Other layers such as **attention projections** (`to_qkv`, `to_out`) and **text-stream MLPs** (`txt_mlp`) are generally more robust due to normalization or more stable input statistics.
 
 ## Configuration
 
@@ -60,11 +58,11 @@ The available `ignored_layers` names depend on the model architecture (e.g., `to
 
 ## Supported Models
 
-| Model | HF Models | FP8 scope | `ignored_layers` (optional) |
-|-------|-----------|-----------|------------------------------|
-| Z-Image | `Tongyi-MAI/Z-Image-Turbo` | Main blocks (attention + FFN) use FP8. **Always BF16 in code:** timestep MLP, per-block adaLN modulation linear, patch and caption embedders, final layer (modulation + `proj_out`). | None required for those paths |
-| Qwen-Image | `Qwen/Qwen-Image`, `Qwen/Qwen-Image-2512` | Joint attention and MLPs can use FP8. **Always BF16 in code:** timestep MLP, per-block `img_mod` / `txt_mod` linears, `img_in` / `txt_in`, `norm_out.linear`, `proj_out`. | Still recommend `img_mlp` for quality |
-| Flux | `black-forest-labs/FLUX.1-dev` | **Single-stream** blocks (`single_transformer_blocks`) use FP8. **Always BF16 in code:** all **dual-stream** blocks (`transformer_blocks`, joint attention path), AdaLayerNormZeroSingle modulation in single blocks, and `norm_out` before final `proj_out`. | None required for those paths |
+| Model | HF Models | Recommendation | `ignored_layers` |
+|-------|-----------|---------------|------------------|
+| Z-Image | `Tongyi-MAI/Z-Image-Turbo` | All layers | None |
+| Qwen-Image | `Qwen/Qwen-Image`, `Qwen/Qwen-Image-2512` | Skip sensitive layers | `img_mlp` |
+| Flux | `black-forest-labs/FLUX.1-dev` | All layers | None |
 | HunyuanImage-3 | `tencent/HunyuanImage3` | All layers | None |
 | HunyuanVideo-1.5 | `hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-480p_t2v`, `720p_t2v`, `480p_i2v` | All layers | None |
 | Helios | `BestWishYsh/Helios-Base`, `BestWishYsh/Helios-Mid`, `BestWishYsh/Helios-Distilled` | All layers | None |
diff --git a/docs/user_guide/diffusion/quantization/overview.md b/docs/user_guide/diffusion/quantization/overview.md
index 4d1b37e73d4..25d7fa5c756 100644
--- a/docs/user_guide/diffusion/quantization/overview.md
+++ b/docs/user_guide/diffusion/quantization/overview.md
@@ -53,7 +53,7 @@ When `--quantization fp8` is enabled for diffusion models:
 
 | Component | What Gets Quantized | Mechanism |
 |-----------|-------------------|-----------|
-| **DiT (transformer)** | `nn.Linear` layers (minus [built-in BF16 submodules](fp8.md#supported-models) on Z-Image, Qwen-Image, FLUX.1-dev) | vLLM W8A8 FP8 compute (Ada/Hopper) or weight-only (older GPUs) |
+| **DiT (transformer)** | `nn.Linear` layers | vLLM W8A8 FP8 compute (Ada/Hopper) or weight-only (older GPUs) |
 | **Text encoder** | `nn.Linear` layers | FP8 weight storage, BF16 compute |
 | **VAE** | `nn.Conv2d`, `nn.Conv3d` layers | FP8 weight storage, BF16 compute |
 

From 47ded5dc16e17fe73aac7e65d4bde122b14b7c82 Mon Sep 17 00:00:00 2001
From: Zhang <jianmusings@gmail.com>
Date: Tue, 14 Apr 2026 12:15:13 +0000
Subject: [PATCH 6/6] docs: align quantization contributor guide with main

Signed-off-by: Zhang <jianmusings@gmail.com>
---
 docs/contributing/model/adding_quantization_model.md | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/docs/contributing/model/adding_quantization_model.md b/docs/contributing/model/adding_quantization_model.md
index 38e07387b3c..f2731888846 100644
--- a/docs/contributing/model/adding_quantization_model.md
+++ b/docs/contributing/model/adding_quantization_model.md
@@ -156,7 +156,7 @@ Some layers degrade output quality when quantized. Common sensitive layers:
 |-------|--------|-----------|----------------------|----------------|
 | Qwen-Image-2512 | Int8 | LPIPS 0.0197 | 0.0027 (skip `img_mlp`) | Skip `img_mlp` |
 | Z-Image-Turbo | Int8 | LPIPS 0.1597 | 0.0290 (skip `feed_forward`) | Skip `feed_forward` |
-| Z-Image-Turbo | FP8 | LPIPS ~0.005 | — | Modulation / embedders / final layer stay BF16 in code ([#2728](https://github.com/vllm-project/vllm-omni/pull/2728)) |
+| Z-Image-Turbo | FP8 | LPIPS ~0.005 | — | All layers OK |
 
 To identify sensitive layers for a new model:
 
@@ -400,7 +400,7 @@ aggressive quantization methods (Int8, NVFP4).
 |-------|--------|-------------------|----------------------|---------------------|
 | Qwen-Image-2512 | Int8 | 0.0197 | 0.0027 | `img_mlp` |
 | Z-Image-Turbo | Int8 | 0.1597 | 0.0290 | `feed_forward` |
-| Z-Image-Turbo | FP8 | ~0.005 | — | Built-in BF16 for modulation, embedders, final layer ([#2728](https://github.com/vllm-project/vllm-omni/pull/2728)) |
+| Z-Image-Turbo | FP8 | ~0.005 | — | None needed |
 
 ### What to Verify (Checklist)
 
@@ -459,9 +459,8 @@ aggressive quantization methods (Int8, NVFP4).
 
 | Model | Pipeline File | Transformer File | Notes |
 |-------|-------------|-----------------|-------|
-| **Z-Image** | `models/z_image/pipeline_z_image.py` | `models/z_image/z_image_transformer.py` | DiT blocks + text encoder FP8; modulation, embedders, final layer BF16 ([#2728](https://github.com/vllm-project/vllm-omni/pull/2728)) |
-| **Qwen-Image** | `models/qwen_image/pipeline_qwen_image.py` | `models/qwen_image/qwen_image_transformer.py` | DiT blocks + text encoder + VAE FP8; timestep MLP, modulation linears, `img_in`/`txt_in`, `norm_out`/`proj_out` BF16 ([#2728](https://github.com/vllm-project/vllm-omni/pull/2728)) |
-| **FLUX.1-dev** | `models/flux/pipeline_flux.py` | `models/flux/flux_transformer.py` | Single-stream blocks FP8; dual-stream blocks + final `norm_out` BF16 ([#2728](https://github.com/vllm-project/vllm-omni/pull/2728)) |
+| **Z-Image** | `models/z_image/pipeline_z_image.py` | `models/z_image/z_image_transformer.py` | DiT + text encoder FP8 |
+| **Qwen-Image** | `models/qwen_image/pipeline_qwen_image.py` | `models/qwen_image/qwen_image_transformer.py` | DiT + text encoder + VAE FP8 |
 
 All files are under `vllm_omni/diffusion/`.