vllm-project · lishunyang12 · Feb 20, 2026 · Feb 24, 2026 · Feb 24, 2026
@@ -272,6 +272,17 @@ def __init__(
         self.vae = AutoencoderKLQwenImage.from_pretrained(model, subfolder="vae", local_files_only=local_files_only).to(
             self.device
         )
+
+        # Apply FP8 weight quantization to VAE and text encoder
+        if (
+            od_config.quantization_config is not None
+            and getattr(od_config.quantization_config, "quant_method", None) == "fp8"
+        ):
+            from vllm_omni.diffusion.models.utils import apply_fp8_weight_storage
+
+            apply_fp8_weight_storage(self.vae)
+            apply_fp8_weight_storage(self.text_encoder)
+
         transformer_kwargs = get_transformer_config_kwargs(od_config.tf_model_config, QwenImageTransformer2DModel)
         quant_config = get_vllm_quant_config_for_layers(od_config.quantization_config)
         self.transformer = QwenImageTransformer2DModel(
@@ -719,4 +730,9 @@ def forward(
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
-        return loader.load_weights(weights)
+        loaded_weights = loader.load_weights(weights)
+        # VAE and text_encoder are loaded via from_pretrained(), not through
+        # the weight pipeline, so mark their weights as loaded.
+        loaded_weights |= {f"vae.{name}" for name, _ in self.vae.named_parameters()}
+        loaded_weights |= {f"text_encoder.{name}" for name, _ in self.text_encoder.named_parameters()}
+        return loaded_weights
@@ -249,6 +249,17 @@ def __init__(
         self.vae = AutoencoderKLQwenImage.from_pretrained(model, subfolder="vae", local_files_only=local_files_only).to(
             self.device
         )
+
+        # Apply FP8 weight quantization to VAE and text encoder
+        if (
+            od_config.quantization_config is not None
+            and getattr(od_config.quantization_config, "quant_method", None) == "fp8"
+        ):
+            from vllm_omni.diffusion.models.utils import apply_fp8_weight_storage
+
+            apply_fp8_weight_storage(self.vae)
+            apply_fp8_weight_storage(self.text_encoder)
+
         transformer_kwargs = get_transformer_config_kwargs(od_config.tf_model_config, QwenImageTransformer2DModel)
         self.transformer = QwenImageTransformer2DModel(od_config=od_config, **transformer_kwargs)
         self.tokenizer = Qwen2Tokenizer.from_pretrained(model, subfolder="tokenizer", local_files_only=local_files_only)
@@ -818,4 +829,9 @@ def forward(
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
-        return loader.load_weights(weights)
+        loaded_weights = loader.load_weights(weights)
+        # VAE and text_encoder are loaded via from_pretrained(), not through
+        # the weight pipeline, so mark their weights as loaded.
+        loaded_weights |= {f"vae.{name}" for name, _ in self.vae.named_parameters()}
+        loaded_weights |= {f"text_encoder.{name}" for name, _ in self.text_encoder.named_parameters()}
+        return loaded_weights
@@ -201,6 +201,16 @@ def __init__(
             self.device
         )
 
+        # Apply FP8 weight quantization to VAE and text encoder
+        if (
+            od_config.quantization_config is not None
+            and getattr(od_config.quantization_config, "quant_method", None) == "fp8"
+        ):
+            from vllm_omni.diffusion.models.utils import apply_fp8_weight_storage
+
+            apply_fp8_weight_storage(self.vae)
+            apply_fp8_weight_storage(self.text_encoder)
+
         transformer_kwargs = get_transformer_config_kwargs(od_config.tf_model_config, QwenImageTransformer2DModel)
         self.transformer = QwenImageTransformer2DModel(od_config=od_config, **transformer_kwargs)
         self.tokenizer = Qwen2Tokenizer.from_pretrained(model, subfolder="tokenizer", local_files_only=local_files_only)
@@ -773,4 +783,9 @@ def forward(
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
-        return loader.load_weights(weights)
+        loaded_weights = loader.load_weights(weights)
+        # VAE and text_encoder are loaded via from_pretrained(), not through
+        # the weight pipeline, so mark their weights as loaded.
+        loaded_weights |= {f"vae.{name}" for name, _ in self.vae.named_parameters()}
+        loaded_weights |= {f"text_encoder.{name}" for name, _ in self.text_encoder.named_parameters()}
+        return loaded_weights
@@ -0,0 +1,83 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Utility functions for diffusion models."""
+
+import logging
+
+import torch
+from torch import nn
+
+logger = logging.getLogger(__name__)
+
+# Maximum value for float8_e4m3fn
+_FP8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max
+
+
+_FP8_TARGET_LAYERS = (nn.Linear, nn.Conv2d, nn.Conv3d)
+
+
+def apply_fp8_weight_storage(model: nn.Module) -> None:
+    """Apply FP8 weight-only storage to Linear/Conv2d/Conv3d layers.
+
+    Stores weights in float8_e4m3fn with per-tensor scales.
+    Dequantizes to the original compute dtype before each forward pass,
+    then re-quantizes afterward to free BF16 memory.
+
+    This saves ~50% of memory with no accuracy loss since computation
+    still happens in the original dtype.
+
+    Args:
+        model: The model whose layers will be quantized.
+    """
+    count = 0
+    for name, module in model.named_modules():
+        if not isinstance(module, _FP8_TARGET_LAYERS):
+            continue
+
+        # P4: Idempotency guard -- skip if already quantized
+        if hasattr(module, "_fp8_weight"):
+            continue
+
+        weight = module.weight.data
+        compute_dtype = weight.dtype
+
+        # Compute per-tensor scale
+        amax = weight.abs().amax().clamp(min=1e-12)
+        scale = amax / _FP8_E4M3_MAX
+
+        # Quantize weight to FP8
+        fp8_weight = (weight / scale).clamp(min=-_FP8_E4M3_MAX, max=_FP8_E4M3_MAX).to(torch.float8_e4m3fn)
+
+        # Store FP8 weight and metadata as buffers (not parameters)
+        module.register_buffer("_fp8_weight", fp8_weight)
+        module.register_buffer("_fp8_scale", scale.to(torch.float32))
+        module._fp8_compute_dtype = compute_dtype
+
+        # P1: Keep the parameter at the original compute dtype so that
+        # model.dtype (derived from parameters) stays correct.  We store
+        # the FP8 representation in the _fp8_weight buffer and only
+        # dequantize into module.weight.data inside the pre-hook.
+        # After forward, the post-hook swaps back to FP8 storage to
+        # free the BF16/FP16 memory.
+        module.weight.data = fp8_weight.to(compute_dtype)
+
+        def _pre_hook(mod, args):
+            # Dequantize: restore BF16/FP16 weight for computation
+            # P2: Cast back to compute dtype to avoid float32 promotion
+            #     from the float32 scale tensor.
+            mod.weight.data = (mod._fp8_weight.to(mod._fp8_compute_dtype) * mod._fp8_scale).to(mod._fp8_compute_dtype)
+
+        def _post_hook(mod, args, output):
+            # Re-quantize: swap back to FP8-dequantized placeholder to
+            # free full-precision memory while keeping dtype correct.
+            mod.weight.data = mod._fp8_weight.to(mod._fp8_compute_dtype)
+
+        module.register_forward_pre_hook(_pre_hook)
+        module.register_forward_hook(_post_hook)
+        count += 1
+
+    logger.info(
+        "Applied FP8 weight storage to %d layers in %s",
+        count,
+        model.__class__.__name__,
+    )