From 3524dcb32e57a9b8da56eba6db35ac0491d5f656 Mon Sep 17 00:00:00 2001
From: lishunyang <lishunyang12@163.com>
Date: Sat, 21 Feb 2026 01:42:59 +0800
Subject: [PATCH 01/45] [Quantization] Add FP8 KV quantization for diffusion
 attention layers

Reduce attention K/V memory by ~50% via per-tensor dynamic FP8
quantization. On Hopper GPUs with FA3, this also accelerates
attention via native FP8 tensor cores; on FA2/SDPA backends,
K/V are dequantized before the kernel (memory-only benefit).

- Add quantize_kv_fp8() / dequantize_fp8() utilities in vllm_omni/quantization/
- Add kv_quantization field to OmniDiffusionConfig
- Add k_scale / v_scale fields to AttentionMetadata
- Quantize K/V (+ joint K/V) in Attention.forward() after pre_attention
- FA3 native FP8 path with descale_k/descale_v in FlashAttentionImpl
- Dequant fallback for padded batches (varlen path) and SDPA backend
- Guard against ring attention + FP8 KV (incompatible)
- Add --kv-quantization CLI flag to text_to_image.py example
- Add unit tests for roundtrip, scales, zero tensor, config integration

Signed-off-by: lishunyang <lishunyang12@163.com>
---
 .../text_to_image/text_to_image.py            | 11 +++
 tests/diffusion/quantization/test_kv_quant.py | 98 +++++++++++++++++++
 .../diffusion/attention/backends/abstract.py  |  4 +
 .../attention/backends/flash_attn.py          | 63 ++++++++++++
 .../diffusion/attention/backends/sdpa.py      | 13 +++
 vllm_omni/diffusion/attention/layer.py        | 44 +++++++++
 vllm_omni/diffusion/data.py                   |  6 ++
 vllm_omni/quantization/kv_quant.py            | 64 ++++++++++++
 8 files changed, 303 insertions(+)
 create mode 100644 tests/diffusion/quantization/test_kv_quant.py
 create mode 100644 vllm_omni/quantization/kv_quant.py

diff --git a/examples/offline_inference/text_to_image/text_to_image.py b/examples/offline_inference/text_to_image/text_to_image.py
index 927b0f0b087..c17c87fcec4 100644
--- a/examples/offline_inference/text_to_image/text_to_image.py
+++ b/examples/offline_inference/text_to_image/text_to_image.py
@@ -168,6 +168,13 @@ def parse_args() -> argparse.Namespace:
         "Available layers: to_qkv, to_out, add_kv_proj, to_add_out, img_mlp, txt_mlp, proj_out. "
         "Example: --ignored-layers 'add_kv_proj,to_add_out'",
     )
+    parser.add_argument(
+        "--kv-quantization",
+        action="store_true",
+        help="Enable FP8 quantization of attention K/V tensors for memory reduction. "
+        "Requires --quantization fp8. On Hopper GPUs with FA3, also accelerates attention. "
+        "On other backends (FA2/SDPA), K/V are dequantized before the kernel (memory-only benefit).",
+    )
     parser.add_argument(
         "--vae-use-slicing",
         action="store_true",
@@ -304,6 +311,7 @@ def main():
     # ignored_layers is specified so the list flows through OmniDiffusionConfig
     quant_kwargs: dict[str, Any] = {}
     ignored_layers = [s.strip() for s in args.ignored_layers.split(",") if s.strip()] if args.ignored_layers else None
+    kv_quantization = getattr(args, "kv_quantization", False)
     if args.quantization == "gguf":
         if not args.gguf_model:
             raise ValueError("--gguf-model is required when --quantization gguf is set.")
@@ -331,6 +339,7 @@ def main():
         "enforce_eager": args.enforce_eager,
         "enable_cpu_offload": args.enable_cpu_offload,
         "mode": "text-to-image",
+        "kv_quantization": kv_quantization,
         "log_stats": args.log_stats,
         "enable_diffusion_pipeline_profiler": args.enable_diffusion_pipeline_profiler,
         **lora_args,
@@ -354,6 +363,8 @@ def main():
     print(f"  Inference steps: {args.num_inference_steps}")
     print(f"  Cache backend: {cache_backend if cache_backend else 'None (no acceleration)'}")
     print(f"  Quantization: {args.quantization if args.quantization else 'None (BF16)'}")
+    if kv_quantization:
+        print("  KV quantization: FP8 (enabled)")
     if ignored_layers:
         print(f"  Ignored layers: {ignored_layers}")
     print(
diff --git a/tests/diffusion/quantization/test_kv_quant.py b/tests/diffusion/quantization/test_kv_quant.py
new file mode 100644
index 00000000000..5c1dd9334d4
--- /dev/null
+++ b/tests/diffusion/quantization/test_kv_quant.py
@@ -0,0 +1,98 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Tests for FP8 KV quantization utilities."""
+
+import pytest
+import torch
+
+pytestmark = [pytest.mark.core_model, pytest.mark.diffusion]
+
+
+def test_roundtrip_preserves_values():
+    """quantize_kv_fp8 -> dequantize_fp8 should preserve values within FP8 tolerance."""
+    from vllm_omni.quantization.kv_quant import dequantize_fp8, quantize_kv_fp8
+
+    torch.manual_seed(42)
+    key = torch.randn(2, 128, 8, 64, dtype=torch.bfloat16)
+    value = torch.randn(2, 128, 8, 64, dtype=torch.bfloat16)
+
+    fp8_key, fp8_value, k_scale, v_scale = quantize_kv_fp8(key, value)
+
+    assert fp8_key.dtype == torch.float8_e4m3fn
+    assert fp8_value.dtype == torch.float8_e4m3fn
+    assert k_scale.numel() == 1
+    assert v_scale.numel() == 1
+
+    key_rt = dequantize_fp8(fp8_key, k_scale, torch.bfloat16)
+    value_rt = dequantize_fp8(fp8_value, v_scale, torch.bfloat16)
+
+    assert key_rt.shape == key.shape
+    assert value_rt.shape == value.shape
+
+    # FP8 e4m3 has ~0.1% relative error for typical values
+    torch.testing.assert_close(key_rt, key, rtol=0.05, atol=0.05)
+    torch.testing.assert_close(value_rt, value, rtol=0.05, atol=0.05)
+
+
+def test_scales_are_positive():
+    from vllm_omni.quantization.kv_quant import quantize_kv_fp8
+
+    key = torch.randn(1, 64, 4, 32, dtype=torch.bfloat16)
+    value = torch.randn(1, 64, 4, 32, dtype=torch.bfloat16)
+
+    _, _, k_scale, v_scale = quantize_kv_fp8(key, value)
+    assert k_scale > 0
+    assert v_scale > 0
+
+
+def test_zero_tensor():
+    """All-zero input should not produce NaN or Inf."""
+    from vllm_omni.quantization.kv_quant import dequantize_fp8, quantize_kv_fp8
+
+    key = torch.zeros(1, 16, 4, 32, dtype=torch.bfloat16)
+    value = torch.zeros(1, 16, 4, 32, dtype=torch.bfloat16)
+
+    fp8_key, fp8_value, k_scale, v_scale = quantize_kv_fp8(key, value)
+    key_rt = dequantize_fp8(fp8_key, k_scale, torch.bfloat16)
+    value_rt = dequantize_fp8(fp8_value, v_scale, torch.bfloat16)
+
+    assert not torch.isnan(key_rt).any()
+    assert not torch.isnan(value_rt).any()
+    assert torch.allclose(key_rt, key)
+
+
+def test_fp16_input():
+    """Should work with float16 input as well."""
+    from vllm_omni.quantization.kv_quant import quantize_kv_fp8
+
+    key = torch.randn(1, 32, 4, 64, dtype=torch.float16)
+    value = torch.randn(1, 32, 4, 64, dtype=torch.float16)
+
+    fp8_key, fp8_value, k_scale, v_scale = quantize_kv_fp8(key, value)
+    assert fp8_key.dtype == torch.float8_e4m3fn
+    assert fp8_value.dtype == torch.float8_e4m3fn
+
+
+def test_kv_quantization_config_field():
+    """OmniDiffusionConfig should accept kv_quantization field."""
+    from vllm_omni.diffusion.data import OmniDiffusionConfig
+
+    config = OmniDiffusionConfig(model="test", kv_quantization=True)
+    assert config.kv_quantization is True
+
+    config_default = OmniDiffusionConfig(model="test")
+    assert config_default.kv_quantization is False
+
+
+def test_attention_metadata_scales():
+    """AttentionMetadata should have k_scale and v_scale fields."""
+    from vllm_omni.diffusion.attention.backends.abstract import AttentionMetadata
+
+    meta = AttentionMetadata()
+    assert meta.k_scale is None
+    assert meta.v_scale is None
+
+    scale = torch.tensor(0.5)
+    meta.k_scale = scale
+    meta.v_scale = scale
+    assert meta.k_scale is scale
diff --git a/vllm_omni/diffusion/attention/backends/abstract.py b/vllm_omni/diffusion/attention/backends/abstract.py
index d0a62bcd9cc..987870aa84b 100644
--- a/vllm_omni/diffusion/attention/backends/abstract.py
+++ b/vllm_omni/diffusion/attention/backends/abstract.py
@@ -65,6 +65,10 @@ class AttentionMetadata:
     joint_strategy: str = "front"
     # the strategy to joint the query, key, and value, can be "front" or "rear"
 
+    # FP8 KV quantization dequant scales (set by Attention._quantize_kv)
+    k_scale: torch.Tensor | None = None
+    v_scale: torch.Tensor | None = None
+
 
 T = TypeVar("T", bound=AttentionMetadata)
 
diff --git a/vllm_omni/diffusion/attention/backends/flash_attn.py b/vllm_omni/diffusion/attention/backends/flash_attn.py
index 5c586c0631e..b1a1054b819 100644
--- a/vllm_omni/diffusion/attention/backends/flash_attn.py
+++ b/vllm_omni/diffusion/attention/backends/flash_attn.py
@@ -97,6 +97,9 @@ def forward_cuda(
         attn_metadata: AttentionMetadata = None,
     ) -> torch.Tensor:
         """CUDA/ROCm flash attention implementation."""
+        # Dispatch to FP8 path if K/V are quantized
+        if key.dtype == torch.float8_e4m3fn:
+            return self._forward_fp8(query, key, value, attn_metadata)
         from vllm_omni.diffusion.attention.backends.utils.fa import (
             HAS_FLASH_ATTN,
             flash_attn_func,
@@ -209,3 +212,63 @@ def forward_npu(
             layout="BNSD",
         )
         return output
+
+    def _forward_fp8(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        attn_metadata: AttentionMetadata,
+    ) -> torch.Tensor:
+        """FP8 KV attention path: native FA3 or dequant fallback.
+
+        When an attention mask with padding is present, we dequantize and
+        fall through to the standard varlen path to avoid bypassing the
+        mask (FA3's varlen API does not support descale_k/descale_v).
+        """
+        from vllm_omni.quantization.kv_quant import dequantize_fp8
+
+        k_scale = attn_metadata.k_scale
+        v_scale = attn_metadata.v_scale
+
+        attention_mask = attn_metadata.attn_mask if attn_metadata is not None else None
+        has_padding = attention_mask is not None and torch.any(~attention_mask)
+
+        # If padding is present, dequant and use the standard masked path
+        if has_padding:
+            key = dequantize_fp8(key, k_scale, query.dtype)
+            value = dequantize_fp8(value, v_scale, query.dtype)
+            attn_metadata.k_scale = None
+            attn_metadata.v_scale = None
+            return self.forward_cuda(query, key, value, attn_metadata)
+
+        # Try FA3 native FP8 (Hopper / Ada / Ampere via fa3-fwd)
+        from vllm_omni.diffusion.attention.backends.ring.ring_globals import (
+            HAS_FA3,
+            fa3_attn_func,
+        )
+
+        if HAS_FA3 and fa3_attn_func is not None:
+            out = fa3_attn_func(
+                query,
+                key,
+                value,
+                softmax_scale=self.softmax_scale,
+                causal=self.causal,
+                descale_k=k_scale,
+                descale_v=v_scale,
+            )
+            if isinstance(out, tuple):
+                out = out[0]
+            return out
+
+        # Fallback: dequantize to compute dtype and use standard path
+        logger.warning_once(
+            "FP8 KV quantization without FA3 provides no performance benefit. "
+            "Install FA3 for optimal FP8 support on Hopper GPUs."
+        )
+        key = dequantize_fp8(key, k_scale, query.dtype)
+        value = dequantize_fp8(value, v_scale, query.dtype)
+        attn_metadata.k_scale = None
+        attn_metadata.v_scale = None
+        return self.forward_cuda(query, key, value, attn_metadata)
diff --git a/vllm_omni/diffusion/attention/backends/sdpa.py b/vllm_omni/diffusion/attention/backends/sdpa.py
index 3585689dd27..bf9550eada3 100644
--- a/vllm_omni/diffusion/attention/backends/sdpa.py
+++ b/vllm_omni/diffusion/attention/backends/sdpa.py
@@ -97,6 +97,19 @@ def _forward_impl(
         attn_metadata: AttentionMetadata | None = None,
         mask_mode: SDPAMaskMode = "broadcast_k",
     ) -> torch.Tensor:
+        # FP8 KV dequantization: SDPA does not support FP8 natively
+        if key.dtype == torch.float8_e4m3fn:
+            from vllm_omni.quantization.kv_quant import dequantize_fp8
+
+            k_scale = attn_metadata.k_scale if attn_metadata else None
+            v_scale = attn_metadata.v_scale if attn_metadata else None
+            key = dequantize_fp8(key, k_scale, query.dtype)
+            value = dequantize_fp8(value, v_scale, query.dtype)
+            logger.warning_once(
+                "FP8 KV with SDPA backend: dequantizing to compute dtype. "
+                "No memory or performance benefit. Use FA3 for optimal FP8 support."
+            )
+
         # Normalize mask before permuting q/k/v.
         # _maybe_reshape_attn_mask expects sequence length on dim=1.
         attention_mask = None
diff --git a/vllm_omni/diffusion/attention/layer.py b/vllm_omni/diffusion/attention/layer.py
index f83bb294d22..c736aa64c15 100644
--- a/vllm_omni/diffusion/attention/layer.py
+++ b/vllm_omni/diffusion/attention/layer.py
@@ -91,6 +91,22 @@ def __init__(
         # Fallback strategy when SP is not active (outside sharded regions)
         self._no_parallel_strategy = NoParallelAttention()
 
+        # FP8 KV quantization: read from forward context config
+        self._kv_quant_enabled = False
+        try:
+            config = get_forward_context().omni_diffusion_config
+            self._kv_quant_enabled = config.kv_quantization
+        except Exception:
+            pass
+
+        if self._kv_quant_enabled and self.use_ring:
+            raise ValueError(
+                "FP8 KV quantization is not compatible with ring attention "
+                "(ring_degree > 1). Ring kernels do not propagate FP8 descale "
+                "factors, which would silently corrupt results. Disable one of "
+                "the two, or use Ulysses SP instead."
+            )
+
     def _get_active_parallel_strategy(self):
         """Get the parallel strategy based on current SP active state.
 
@@ -104,6 +120,30 @@ def _get_active_parallel_strategy(self):
                 return self._no_parallel_strategy
         return self.parallel_strategy
 
+    def _quantize_kv(
+        self,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        attn_metadata: AttentionMetadata | None,
+    ) -> tuple[torch.Tensor, torch.Tensor, AttentionMetadata | None]:
+        """Quantize K/V tensors to FP8 and store scales in attn_metadata."""
+        from vllm_omni.quantization.kv_quant import quantize_kv_fp8
+
+        fp8_key, fp8_value, k_scale, v_scale = quantize_kv_fp8(key, value)
+
+        if attn_metadata is None:
+            attn_metadata = AttentionMetadata()
+        attn_metadata.k_scale = k_scale
+        attn_metadata.v_scale = v_scale
+
+        # Also quantize joint_key/joint_value if present
+        if attn_metadata.joint_key is not None and attn_metadata.joint_value is not None:
+            jk, jv, _, _ = quantize_kv_fp8(attn_metadata.joint_key, attn_metadata.joint_value)
+            attn_metadata.joint_key = jk
+            attn_metadata.joint_value = jv
+
+        return fp8_key, fp8_value, attn_metadata
+
     def forward(
         self,
         query: torch.Tensor,
@@ -119,6 +159,10 @@ def forward(
         # For Ring: Concat joint_q
         query, key, value, attn_metadata, ctx = strategy.pre_attention(query, key, value, attn_metadata)
 
+        # 1.5 FP8 KV quantization (after AllToAll stays BF16, before kernel)
+        if self._kv_quant_enabled:
+            key, value, attn_metadata = self._quantize_kv(key, value, attn_metadata)
+
         # 2. Kernel Execution (Computation)
         if self.use_ring and strategy is not self._no_parallel_strategy:
             out = self._run_ring_attention(query, key, value, attn_metadata)
diff --git a/vllm_omni/diffusion/data.py b/vllm_omni/diffusion/data.py
index 488378b40ff..5bbc413553f 100644
--- a/vllm_omni/diffusion/data.py
+++ b/vllm_omni/diffusion/data.py
@@ -486,6 +486,12 @@ class OmniDiffusionConfig:
     # Per-component: {"transformer": {"method": "fp8"}, "vae": None}
     quantization_config: str | QuantizationConfig | dict[str, Any] | None = None
 
+    # FP8 KV quantization: dynamically quantize attention K/V tensors to
+    # float8_e4m3fn each forward pass. Orthogonal to weight quantization.
+    # On Hopper+FA3: native FP8 attention (memory + compute savings).
+    # On FA2/SDPA: dequant fallback (memory-only savings).
+    kv_quantization: bool = False
+
     # Diffusion pipeline Profiling config
     enable_diffusion_pipeline_profiler: bool = False
 
diff --git a/vllm_omni/quantization/kv_quant.py b/vllm_omni/quantization/kv_quant.py
new file mode 100644
index 00000000000..7f7696fb506
--- /dev/null
+++ b/vllm_omni/quantization/kv_quant.py
@@ -0,0 +1,64 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""FP8 quantization utilities for diffusion model KV tensors.
+
+Provides per-tensor dynamic quantization of Key and Value tensors to
+float8_e4m3fn format. Designed for diffusion models where K/V are computed
+fresh each forward pass (no persistent KV cache).
+"""
+
+import torch
+
+
+def quantize_kv_fp8(
+    key: torch.Tensor,
+    value: torch.Tensor,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    """Quantize K/V tensors to float8_e4m3fn with dynamic per-tensor scaling.
+
+    Uses the same absmax scaling pattern as vLLM's ``input_to_float8``
+    (see ``vllm/model_executor/layers/quantization/utils/fp8_utils.py``).
+
+    Args:
+        key: Key tensor in BF16/FP16, shape ``(B, S, H, D)``
+        value: Value tensor in BF16/FP16, shape ``(B, S, H, D)``
+
+    Returns:
+        A tuple of ``(fp8_key, fp8_value, k_scale, v_scale)`` where scales
+        are *inverse* (dequant) scales: ``inv_scale = amax / FP8_MAX``.
+        Pass these scales as ``descale_k`` / ``descale_v`` to FA3 or use
+        :func:`dequantize_fp8` to convert back.
+    """
+    finfo = torch.finfo(torch.float8_e4m3fn)
+
+    # Key
+    k_amax = key.abs().amax().clamp(min=1e-12)
+    k_scale_factor = finfo.max / k_amax
+    fp8_key = (key * k_scale_factor).clamp(finfo.min, finfo.max).to(torch.float8_e4m3fn)
+    k_inv_scale = k_amax / finfo.max  # dequant scale
+
+    # Value
+    v_amax = value.abs().amax().clamp(min=1e-12)
+    v_scale_factor = finfo.max / v_amax
+    fp8_value = (value * v_scale_factor).clamp(finfo.min, finfo.max).to(torch.float8_e4m3fn)
+    v_inv_scale = v_amax / finfo.max  # dequant scale
+
+    return fp8_key, fp8_value, k_inv_scale, v_inv_scale
+
+
+def dequantize_fp8(
+    tensor: torch.Tensor,
+    inv_scale: torch.Tensor,
+    output_dtype: torch.dtype,
+) -> torch.Tensor:
+    """Dequantize an FP8 tensor back to the given dtype.
+
+    Args:
+        tensor: FP8-quantized tensor (float8_e4m3fn).
+        inv_scale: Inverse scale (dequant scale) produced by :func:`quantize_kv_fp8`.
+        output_dtype: Target dtype (e.g. ``torch.bfloat16``).
+
+    Returns:
+        Dequantized tensor: ``tensor.to(output_dtype) * inv_scale``.
+    """
+    return tensor.to(output_dtype) * inv_scale

From 57f00a74923d3f33efbe42d3170952192a893261 Mon Sep 17 00:00:00 2001
From: lishunyang <lishunyang12@163.com>
Date: Thu, 2 Apr 2026 18:21:35 +0800
Subject: [PATCH 02/45] [Quantization] Align FP8 attention with design doc: Q
 quantization, CLI rename, joint scales

Signed-off-by: lishunyang <lishunyang12@163.com>
---
 .../text_to_image/text_to_image.py            |  25 ++--
 tests/diffusion/quantization/test_kv_quant.py | 115 ++++++++++++------
 .../diffusion/attention/backends/abstract.py  |   6 +-
 .../attention/backends/flash_attn.py          |  55 +++++----
 .../diffusion/attention/backends/sdpa.py      |  13 +-
 vllm_omni/diffusion/attention/layer.py        |  48 +++++---
 vllm_omni/diffusion/data.py                   |   5 +
 vllm_omni/quantization/kv_quant.py            |  71 +++++++----
 8 files changed, 217 insertions(+), 121 deletions(-)

diff --git a/examples/offline_inference/text_to_image/text_to_image.py b/examples/offline_inference/text_to_image/text_to_image.py
index c17c87fcec4..099e1130e3b 100644
--- a/examples/offline_inference/text_to_image/text_to_image.py
+++ b/examples/offline_inference/text_to_image/text_to_image.py
@@ -169,11 +169,15 @@ def parse_args() -> argparse.Namespace:
         "Example: --ignored-layers 'add_kv_proj,to_add_out'",
     )
     parser.add_argument(
-        "--kv-quantization",
-        action="store_true",
-        help="Enable FP8 quantization of attention K/V tensors for memory reduction. "
-        "Requires --quantization fp8. On Hopper GPUs with FA3, also accelerates attention. "
-        "On other backends (FA2/SDPA), K/V are dequantized before the kernel (memory-only benefit).",
+        "--kv-cache-dtype",
+        type=str,
+        default="auto",
+        choices=["auto", "fp8"],
+        help="Data type for attention Q/K/V quantization. "
+        "'fp8': dynamically quantize to float8_e4m3fn each forward pass. "
+        "On Hopper GPUs with FA3, enables native FP8 attention compute. "
+        "On other backends (FA2/SDPA), tensors are dequantized before the kernel. "
+        "'auto': no quantization (default).",
     )
     parser.add_argument(
         "--vae-use-slicing",
@@ -307,11 +311,10 @@ def main():
         lora_args["lora_path"] = args.lora_path
         print(f"Using LoRA from: {args.lora_path}")
 
-    # Build quantization kwargs: use quantization_config dict when
-    # ignored_layers is specified so the list flows through OmniDiffusionConfig
+    # Build quantization kwargs
     quant_kwargs: dict[str, Any] = {}
     ignored_layers = [s.strip() for s in args.ignored_layers.split(",") if s.strip()] if args.ignored_layers else None
-    kv_quantization = getattr(args, "kv_quantization", False)
+    kv_cache_dtype = args.kv_cache_dtype if args.kv_cache_dtype != "auto" else None
     if args.quantization == "gguf":
         if not args.gguf_model:
             raise ValueError("--gguf-model is required when --quantization gguf is set.")
@@ -339,9 +342,9 @@ def main():
         "enforce_eager": args.enforce_eager,
         "enable_cpu_offload": args.enable_cpu_offload,
         "mode": "text-to-image",
-        "kv_quantization": kv_quantization,
         "log_stats": args.log_stats,
         "enable_diffusion_pipeline_profiler": args.enable_diffusion_pipeline_profiler,
+        "kv_cache_dtype": kv_cache_dtype,
         **lora_args,
         **quant_kwargs,
     }
@@ -363,8 +366,8 @@ def main():
     print(f"  Inference steps: {args.num_inference_steps}")
     print(f"  Cache backend: {cache_backend if cache_backend else 'None (no acceleration)'}")
     print(f"  Quantization: {args.quantization if args.quantization else 'None (BF16)'}")
-    if kv_quantization:
-        print("  KV quantization: FP8 (enabled)")
+    if kv_cache_dtype:
+        print(f"  KV cache dtype: {kv_cache_dtype}")
     if ignored_layers:
         print(f"  Ignored layers: {ignored_layers}")
     print(
diff --git a/tests/diffusion/quantization/test_kv_quant.py b/tests/diffusion/quantization/test_kv_quant.py
index 5c1dd9334d4..e9c7eebe861 100644
--- a/tests/diffusion/quantization/test_kv_quant.py
+++ b/tests/diffusion/quantization/test_kv_quant.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Tests for FP8 KV quantization utilities."""
+"""Tests for FP8 Q/K/V quantization utilities."""
 
 import pytest
 import torch
@@ -8,91 +8,132 @@
 pytestmark = [pytest.mark.core_model, pytest.mark.diffusion]
 
 
-def test_roundtrip_preserves_values():
-    """quantize_kv_fp8 -> dequantize_fp8 should preserve values within FP8 tolerance."""
-    from vllm_omni.quantization.kv_quant import dequantize_fp8, quantize_kv_fp8
+def test_qkv_roundtrip_preserves_values():
+    """quantize_qkv_fp8 -> dequantize_fp8 should preserve values within FP8 tolerance."""
+    from vllm_omni.quantization.kv_quant import (
+        dequantize_fp8,
+        quantize_qkv_fp8,
+    )
 
     torch.manual_seed(42)
+    query = torch.randn(2, 128, 8, 64, dtype=torch.bfloat16)
     key = torch.randn(2, 128, 8, 64, dtype=torch.bfloat16)
     value = torch.randn(2, 128, 8, 64, dtype=torch.bfloat16)
 
-    fp8_key, fp8_value, k_scale, v_scale = quantize_kv_fp8(key, value)
+    fp8_q, fp8_k, fp8_v, q_scale, k_scale, v_scale = quantize_qkv_fp8(
+        query, key, value
+    )
 
-    assert fp8_key.dtype == torch.float8_e4m3fn
-    assert fp8_value.dtype == torch.float8_e4m3fn
+    assert fp8_q.dtype == torch.float8_e4m3fn
+    assert fp8_k.dtype == torch.float8_e4m3fn
+    assert fp8_v.dtype == torch.float8_e4m3fn
+    assert q_scale.numel() == 1
     assert k_scale.numel() == 1
     assert v_scale.numel() == 1
 
-    key_rt = dequantize_fp8(fp8_key, k_scale, torch.bfloat16)
-    value_rt = dequantize_fp8(fp8_value, v_scale, torch.bfloat16)
-
-    assert key_rt.shape == key.shape
-    assert value_rt.shape == value.shape
+    query_rt = dequantize_fp8(fp8_q, q_scale, torch.bfloat16)
+    key_rt = dequantize_fp8(fp8_k, k_scale, torch.bfloat16)
+    value_rt = dequantize_fp8(fp8_v, v_scale, torch.bfloat16)
 
     # FP8 e4m3 has ~0.1% relative error for typical values
+    torch.testing.assert_close(query_rt, query, rtol=0.05, atol=0.05)
     torch.testing.assert_close(key_rt, key, rtol=0.05, atol=0.05)
     torch.testing.assert_close(value_rt, value, rtol=0.05, atol=0.05)
 
 
-def test_scales_are_positive():
-    from vllm_omni.quantization.kv_quant import quantize_kv_fp8
+def test_kv_only_roundtrip():
+    """quantize_kv_fp8 for joint attention path."""
+    from vllm_omni.quantization.kv_quant import (
+        dequantize_fp8,
+        quantize_kv_fp8,
+    )
 
+    torch.manual_seed(42)
     key = torch.randn(1, 64, 4, 32, dtype=torch.bfloat16)
     value = torch.randn(1, 64, 4, 32, dtype=torch.bfloat16)
 
-    _, _, k_scale, v_scale = quantize_kv_fp8(key, value)
+    fp8_k, fp8_v, k_scale, v_scale = quantize_kv_fp8(key, value)
+
+    assert fp8_k.dtype == torch.float8_e4m3fn
+    assert k_scale > 0
+    assert v_scale > 0
+
+    key_rt = dequantize_fp8(fp8_k, k_scale, torch.bfloat16)
+    torch.testing.assert_close(key_rt, key, rtol=0.05, atol=0.05)
+
+
+def test_scales_are_positive():
+    from vllm_omni.quantization.kv_quant import quantize_qkv_fp8
+
+    q = torch.randn(1, 64, 4, 32, dtype=torch.bfloat16)
+    k = torch.randn(1, 64, 4, 32, dtype=torch.bfloat16)
+    v = torch.randn(1, 64, 4, 32, dtype=torch.bfloat16)
+
+    _, _, _, q_scale, k_scale, v_scale = quantize_qkv_fp8(q, k, v)
+    assert q_scale > 0
     assert k_scale > 0
     assert v_scale > 0
 
 
 def test_zero_tensor():
     """All-zero input should not produce NaN or Inf."""
-    from vllm_omni.quantization.kv_quant import dequantize_fp8, quantize_kv_fp8
+    from vllm_omni.quantization.kv_quant import (
+        dequantize_fp8,
+        quantize_qkv_fp8,
+    )
 
-    key = torch.zeros(1, 16, 4, 32, dtype=torch.bfloat16)
-    value = torch.zeros(1, 16, 4, 32, dtype=torch.bfloat16)
+    q = torch.zeros(1, 16, 4, 32, dtype=torch.bfloat16)
+    k = torch.zeros(1, 16, 4, 32, dtype=torch.bfloat16)
+    v = torch.zeros(1, 16, 4, 32, dtype=torch.bfloat16)
 
-    fp8_key, fp8_value, k_scale, v_scale = quantize_kv_fp8(key, value)
-    key_rt = dequantize_fp8(fp8_key, k_scale, torch.bfloat16)
-    value_rt = dequantize_fp8(fp8_value, v_scale, torch.bfloat16)
+    fp8_q, fp8_k, fp8_v, q_s, k_s, v_s = quantize_qkv_fp8(q, k, v)
+    q_rt = dequantize_fp8(fp8_q, q_s, torch.bfloat16)
+    k_rt = dequantize_fp8(fp8_k, k_s, torch.bfloat16)
 
-    assert not torch.isnan(key_rt).any()
-    assert not torch.isnan(value_rt).any()
-    assert torch.allclose(key_rt, key)
+    assert not torch.isnan(q_rt).any()
+    assert not torch.isnan(k_rt).any()
+    assert torch.allclose(q_rt, q)
+    assert torch.allclose(k_rt, k)
 
 
 def test_fp16_input():
     """Should work with float16 input as well."""
-    from vllm_omni.quantization.kv_quant import quantize_kv_fp8
+    from vllm_omni.quantization.kv_quant import quantize_qkv_fp8
 
-    key = torch.randn(1, 32, 4, 64, dtype=torch.float16)
-    value = torch.randn(1, 32, 4, 64, dtype=torch.float16)
+    q = torch.randn(1, 32, 4, 64, dtype=torch.float16)
+    k = torch.randn(1, 32, 4, 64, dtype=torch.float16)
+    v = torch.randn(1, 32, 4, 64, dtype=torch.float16)
 
-    fp8_key, fp8_value, k_scale, v_scale = quantize_kv_fp8(key, value)
-    assert fp8_key.dtype == torch.float8_e4m3fn
-    assert fp8_value.dtype == torch.float8_e4m3fn
+    fp8_q, fp8_k, fp8_v, _, _, _ = quantize_qkv_fp8(q, k, v)
+    assert fp8_q.dtype == torch.float8_e4m3fn
+    assert fp8_k.dtype == torch.float8_e4m3fn
+    assert fp8_v.dtype == torch.float8_e4m3fn
 
 
-def test_kv_quantization_config_field():
-    """OmniDiffusionConfig should accept kv_quantization field."""
+def test_kv_cache_dtype_config_field():
+    """OmniDiffusionConfig should accept kv_cache_dtype field."""
     from vllm_omni.diffusion.data import OmniDiffusionConfig
 
-    config = OmniDiffusionConfig(model="test", kv_quantization=True)
-    assert config.kv_quantization is True
+    config = OmniDiffusionConfig(model="test", kv_cache_dtype="fp8")
+    assert config.kv_cache_dtype == "fp8"
 
     config_default = OmniDiffusionConfig(model="test")
-    assert config_default.kv_quantization is False
+    assert config_default.kv_cache_dtype is None
 
 
 def test_attention_metadata_scales():
-    """AttentionMetadata should have k_scale and v_scale fields."""
+    """AttentionMetadata should have q/k/v and joint scale fields."""
     from vllm_omni.diffusion.attention.backends.abstract import AttentionMetadata
 
     meta = AttentionMetadata()
+    assert meta.q_scale is None
     assert meta.k_scale is None
     assert meta.v_scale is None
+    assert meta.jk_scale is None
+    assert meta.jv_scale is None
 
     scale = torch.tensor(0.5)
+    meta.q_scale = scale
     meta.k_scale = scale
     meta.v_scale = scale
-    assert meta.k_scale is scale
+    assert meta.q_scale is scale
diff --git a/vllm_omni/diffusion/attention/backends/abstract.py b/vllm_omni/diffusion/attention/backends/abstract.py
index 05dd4c3526b..27476e8c43c 100644
--- a/vllm_omni/diffusion/attention/backends/abstract.py
+++ b/vllm_omni/diffusion/attention/backends/abstract.py
@@ -65,9 +65,13 @@ class AttentionMetadata:
     joint_strategy: str = "front"
     # the strategy to joint the query, key, and value, can be "front" or "rear"
 
-    # FP8 KV quantization dequant scales (set by Attention._quantize_kv)
+    # FP8 attention quantization dequant scales (set by Attention._quantize_qkv_fp8)
+    q_scale: torch.Tensor | None = None
     k_scale: torch.Tensor | None = None
     v_scale: torch.Tensor | None = None
+    # Separate scales for joint (img+txt concat) key/value
+    jk_scale: torch.Tensor | None = None
+    jv_scale: torch.Tensor | None = None
 
 
 T = TypeVar("T", bound=AttentionMetadata)
diff --git a/vllm_omni/diffusion/attention/backends/flash_attn.py b/vllm_omni/diffusion/attention/backends/flash_attn.py
index b1a1054b819..620ca90056b 100644
--- a/vllm_omni/diffusion/attention/backends/flash_attn.py
+++ b/vllm_omni/diffusion/attention/backends/flash_attn.py
@@ -97,7 +97,7 @@ def forward_cuda(
         attn_metadata: AttentionMetadata = None,
     ) -> torch.Tensor:
         """CUDA/ROCm flash attention implementation."""
-        # Dispatch to FP8 path if K/V are quantized
+        # Dispatch to FP8 path if Q/K/V are quantized
         if key.dtype == torch.float8_e4m3fn:
             return self._forward_fp8(query, key, value, attn_metadata)
         from vllm_omni.diffusion.attention.backends.utils.fa import (
@@ -220,14 +220,10 @@ def _forward_fp8(
         value: torch.Tensor,
         attn_metadata: AttentionMetadata,
     ) -> torch.Tensor:
-        """FP8 KV attention path: native FA3 or dequant fallback.
-
-        When an attention mask with padding is present, we dequantize and
-        fall through to the standard varlen path to avoid bypassing the
-        mask (FA3's varlen API does not support descale_k/descale_v).
-        """
+        """FP8 Q/K/V attention path: native FA3 or dequant fallback."""
         from vllm_omni.quantization.kv_quant import dequantize_fp8
 
+        q_scale = attn_metadata.q_scale
         k_scale = attn_metadata.k_scale
         v_scale = attn_metadata.v_scale
 
@@ -236,8 +232,11 @@ def _forward_fp8(
 
         # If padding is present, dequant and use the standard masked path
         if has_padding:
-            key = dequantize_fp8(key, k_scale, query.dtype)
-            value = dequantize_fp8(value, v_scale, query.dtype)
+            output_dtype = torch.bfloat16
+            query = dequantize_fp8(query, q_scale, output_dtype)
+            key = dequantize_fp8(key, k_scale, output_dtype)
+            value = dequantize_fp8(value, v_scale, output_dtype)
+            attn_metadata.q_scale = None
             attn_metadata.k_scale = None
             attn_metadata.v_scale = None
             return self.forward_cuda(query, key, value, attn_metadata)
@@ -249,26 +248,38 @@ def _forward_fp8(
         )
 
         if HAS_FA3 and fa3_attn_func is not None:
-            out = fa3_attn_func(
-                query,
-                key,
-                value,
-                softmax_scale=self.softmax_scale,
-                causal=self.causal,
-                descale_k=k_scale,
-                descale_v=v_scale,
-            )
+            fa3_kwargs: dict = {
+                "softmax_scale": self.softmax_scale,
+                "causal": self.causal,
+                "descale_k": k_scale,
+                "descale_v": v_scale,
+            }
+            # descale_q requires FA3 >= 2.7; guard against older versions
+            try:
+                out = fa3_attn_func(
+                    query, key, value, descale_q=q_scale, **fa3_kwargs
+                )
+            except TypeError:
+                logger.warning_once(
+                    "FA3 does not support descale_q (version < 2.7). "
+                    "Q will run in FP8 without descaling — consider upgrading."
+                )
+                out = fa3_attn_func(query, key, value, **fa3_kwargs)
             if isinstance(out, tuple):
                 out = out[0]
             return out
 
         # Fallback: dequantize to compute dtype and use standard path
         logger.warning_once(
-            "FP8 KV quantization without FA3 provides no performance benefit. "
+            "FP8 attention without FA3 provides no compute benefit. "
             "Install FA3 for optimal FP8 support on Hopper GPUs."
         )
-        key = dequantize_fp8(key, k_scale, query.dtype)
-        value = dequantize_fp8(value, v_scale, query.dtype)
+        output_dtype = torch.bfloat16
+        query_bf16 = dequantize_fp8(query, q_scale, output_dtype)
+        key_bf16 = dequantize_fp8(key, k_scale, output_dtype)
+        value_bf16 = dequantize_fp8(value, v_scale, output_dtype)
+        # Clear scales to avoid re-detection on recursive call
+        attn_metadata.q_scale = None
         attn_metadata.k_scale = None
         attn_metadata.v_scale = None
-        return self.forward_cuda(query, key, value, attn_metadata)
+        return self.forward_cuda(query_bf16, key_bf16, value_bf16, attn_metadata)
diff --git a/vllm_omni/diffusion/attention/backends/sdpa.py b/vllm_omni/diffusion/attention/backends/sdpa.py
index bf9550eada3..f400f57dba8 100644
--- a/vllm_omni/diffusion/attention/backends/sdpa.py
+++ b/vllm_omni/diffusion/attention/backends/sdpa.py
@@ -97,17 +97,20 @@ def _forward_impl(
         attn_metadata: AttentionMetadata | None = None,
         mask_mode: SDPAMaskMode = "broadcast_k",
     ) -> torch.Tensor:
-        # FP8 KV dequantization: SDPA does not support FP8 natively
+        # FP8 dequantization: SDPA does not support FP8 natively
         if key.dtype == torch.float8_e4m3fn:
             from vllm_omni.quantization.kv_quant import dequantize_fp8
 
+            output_dtype = torch.bfloat16
+            q_scale = attn_metadata.q_scale if attn_metadata else None
             k_scale = attn_metadata.k_scale if attn_metadata else None
             v_scale = attn_metadata.v_scale if attn_metadata else None
-            key = dequantize_fp8(key, k_scale, query.dtype)
-            value = dequantize_fp8(value, v_scale, query.dtype)
+            query = dequantize_fp8(query, q_scale, output_dtype)
+            key = dequantize_fp8(key, k_scale, output_dtype)
+            value = dequantize_fp8(value, v_scale, output_dtype)
             logger.warning_once(
-                "FP8 KV with SDPA backend: dequantizing to compute dtype. "
-                "No memory or performance benefit. Use FA3 for optimal FP8 support."
+                "FP8 attention with SDPA backend: dequantizing to compute dtype. "
+                "No compute benefit. Use FA3 for optimal FP8 support."
             )
 
         # Normalize mask before permuting q/k/v.
diff --git a/vllm_omni/diffusion/attention/layer.py b/vllm_omni/diffusion/attention/layer.py
index c736aa64c15..f849d2df9d5 100644
--- a/vllm_omni/diffusion/attention/layer.py
+++ b/vllm_omni/diffusion/attention/layer.py
@@ -91,20 +91,19 @@ def __init__(
         # Fallback strategy when SP is not active (outside sharded regions)
         self._no_parallel_strategy = NoParallelAttention()
 
-        # FP8 KV quantization: read from forward context config
-        self._kv_quant_enabled = False
+        # FP8 attention quantization: read from forward context config
+        self._fp8_attn_enabled = False
         try:
             config = get_forward_context().omni_diffusion_config
-            self._kv_quant_enabled = config.kv_quantization
+            self._fp8_attn_enabled = config.kv_cache_dtype == "fp8"
         except Exception:
             pass
 
-        if self._kv_quant_enabled and self.use_ring:
+        if self._fp8_attn_enabled and self.use_ring:
             raise ValueError(
-                "FP8 KV quantization is not compatible with ring attention "
+                "FP8 attention quantization is not compatible with ring attention "
                 "(ring_degree > 1). Ring kernels do not propagate FP8 descale "
-                "factors, which would silently corrupt results. Disable one of "
-                "the two, or use Ulysses SP instead."
+                "factors. Use Ulysses SP instead."
             )
 
     def _get_active_parallel_strategy(self):
@@ -120,29 +119,40 @@ def _get_active_parallel_strategy(self):
                 return self._no_parallel_strategy
         return self.parallel_strategy
 
-    def _quantize_kv(
+    def _quantize_qkv_fp8(
         self,
+        query: torch.Tensor,
         key: torch.Tensor,
         value: torch.Tensor,
         attn_metadata: AttentionMetadata | None,
-    ) -> tuple[torch.Tensor, torch.Tensor, AttentionMetadata | None]:
-        """Quantize K/V tensors to FP8 and store scales in attn_metadata."""
-        from vllm_omni.quantization.kv_quant import quantize_kv_fp8
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, AttentionMetadata | None]:
+        """Quantize Q/K/V tensors to FP8 and store scales in attn_metadata."""
+        from vllm_omni.quantization.kv_quant import (
+            quantize_kv_fp8,
+            quantize_qkv_fp8,
+        )
 
-        fp8_key, fp8_value, k_scale, v_scale = quantize_kv_fp8(key, value)
+        fp8_q, fp8_k, fp8_v, q_scale, k_scale, v_scale = quantize_qkv_fp8(
+            query, key, value
+        )
 
         if attn_metadata is None:
             attn_metadata = AttentionMetadata()
+        attn_metadata.q_scale = q_scale
         attn_metadata.k_scale = k_scale
         attn_metadata.v_scale = v_scale
 
-        # Also quantize joint_key/joint_value if present
+        # Quantize joint_key/joint_value with separate scales
         if attn_metadata.joint_key is not None and attn_metadata.joint_value is not None:
-            jk, jv, _, _ = quantize_kv_fp8(attn_metadata.joint_key, attn_metadata.joint_value)
+            jk, jv, jk_scale, jv_scale = quantize_kv_fp8(
+                attn_metadata.joint_key, attn_metadata.joint_value
+            )
             attn_metadata.joint_key = jk
             attn_metadata.joint_value = jv
+            attn_metadata.jk_scale = jk_scale
+            attn_metadata.jv_scale = jv_scale
 
-        return fp8_key, fp8_value, attn_metadata
+        return fp8_q, fp8_k, fp8_v, attn_metadata
 
     def forward(
         self,
@@ -159,9 +169,11 @@ def forward(
         # For Ring: Concat joint_q
         query, key, value, attn_metadata, ctx = strategy.pre_attention(query, key, value, attn_metadata)
 
-        # 1.5 FP8 KV quantization (after AllToAll stays BF16, before kernel)
-        if self._kv_quant_enabled:
-            key, value, attn_metadata = self._quantize_kv(key, value, attn_metadata)
+        # 1.5 FP8 Q/K/V quantization (after AllToAll stays BF16, before kernel)
+        if self._fp8_attn_enabled:
+            query, key, value, attn_metadata = self._quantize_qkv_fp8(
+                query, key, value, attn_metadata
+            )
 
         # 2. Kernel Execution (Computation)
         if self.use_ring and strategy is not self._no_parallel_strategy:
diff --git a/vllm_omni/diffusion/data.py b/vllm_omni/diffusion/data.py
index 7b4ff546503..b9fb21ded0c 100644
--- a/vllm_omni/diffusion/data.py
+++ b/vllm_omni/diffusion/data.py
@@ -492,6 +492,11 @@ class OmniDiffusionConfig:
     # On FA2/SDPA: dequant fallback (memory-only savings).
     kv_quantization: bool = False
 
+    # FP8 attention quantization (orthogonal to weight quantization).
+    # "fp8": dynamically quantize Q/K/V to float8_e4m3fn each forward pass.
+    # None or "auto": disabled.
+    kv_cache_dtype: str | None = None
+
     # Diffusion pipeline Profiling config
     enable_diffusion_pipeline_profiler: bool = False
 
diff --git a/vllm_omni/quantization/kv_quant.py b/vllm_omni/quantization/kv_quant.py
index 7f7696fb506..9baf3aa590f 100644
--- a/vllm_omni/quantization/kv_quant.py
+++ b/vllm_omni/quantization/kv_quant.py
@@ -1,49 +1,66 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""FP8 quantization utilities for diffusion model KV tensors.
+"""FP8 quantization utilities for diffusion attention tensors.
 
-Provides per-tensor dynamic quantization of Key and Value tensors to
-float8_e4m3fn format. Designed for diffusion models where K/V are computed
-fresh each forward pass (no persistent KV cache).
+Provides per-tensor dynamic quantization of Q/K/V tensors to
+float8_e4m3fn format. Designed for diffusion models where Q/K/V are
+computed fresh each forward pass (no persistent KV cache).
 """
 
 import torch
 
 
-def quantize_kv_fp8(
+def _quantize_tensor_fp8(
+    tensor: torch.Tensor,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Quantize a single tensor to FP8 with per-tensor dynamic scaling.
+
+    Returns:
+        ``(fp8_tensor, inv_scale)`` where inv_scale is the dequant scale.
+    """
+    finfo = torch.finfo(torch.float8_e4m3fn)
+    amax = tensor.abs().amax().clamp(min=1e-12)
+    scale_factor = finfo.max / amax
+    fp8 = (tensor * scale_factor).clamp(finfo.min, finfo.max).to(torch.float8_e4m3fn)
+    inv_scale = amax / finfo.max
+    return fp8, inv_scale
+
+
+def quantize_qkv_fp8(
+    query: torch.Tensor,
     key: torch.Tensor,
     value: torch.Tensor,
-) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-    """Quantize K/V tensors to float8_e4m3fn with dynamic per-tensor scaling.
-
-    Uses the same absmax scaling pattern as vLLM's ``input_to_float8``
-    (see ``vllm/model_executor/layers/quantization/utils/fp8_utils.py``).
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    """Quantize Q/K/V tensors to float8_e4m3fn with dynamic per-tensor scaling.
 
     Args:
+        query: Query tensor in BF16/FP16, shape ``(B, S, H, D)``
         key: Key tensor in BF16/FP16, shape ``(B, S, H, D)``
         value: Value tensor in BF16/FP16, shape ``(B, S, H, D)``
 
     Returns:
-        A tuple of ``(fp8_key, fp8_value, k_scale, v_scale)`` where scales
-        are *inverse* (dequant) scales: ``inv_scale = amax / FP8_MAX``.
-        Pass these scales as ``descale_k`` / ``descale_v`` to FA3 or use
-        :func:`dequantize_fp8` to convert back.
+        ``(fp8_query, fp8_key, fp8_value, q_scale, k_scale, v_scale)``
+        where scales are inverse (dequant) scales: ``inv_scale = amax / FP8_MAX``.
+        Pass as ``descale_q/k/v`` to FA3 or use :func:`dequantize_fp8`.
     """
-    finfo = torch.finfo(torch.float8_e4m3fn)
+    fp8_q, q_scale = _quantize_tensor_fp8(query)
+    fp8_k, k_scale = _quantize_tensor_fp8(key)
+    fp8_v, v_scale = _quantize_tensor_fp8(value)
+    return fp8_q, fp8_k, fp8_v, q_scale, k_scale, v_scale
 
-    # Key
-    k_amax = key.abs().amax().clamp(min=1e-12)
-    k_scale_factor = finfo.max / k_amax
-    fp8_key = (key * k_scale_factor).clamp(finfo.min, finfo.max).to(torch.float8_e4m3fn)
-    k_inv_scale = k_amax / finfo.max  # dequant scale
 
-    # Value
-    v_amax = value.abs().amax().clamp(min=1e-12)
-    v_scale_factor = finfo.max / v_amax
-    fp8_value = (value * v_scale_factor).clamp(finfo.min, finfo.max).to(torch.float8_e4m3fn)
-    v_inv_scale = v_amax / finfo.max  # dequant scale
+def quantize_kv_fp8(
+    key: torch.Tensor,
+    value: torch.Tensor,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    """Quantize K/V tensors to float8_e4m3fn (joint attention path).
 
-    return fp8_key, fp8_value, k_inv_scale, v_inv_scale
+    Returns:
+        ``(fp8_key, fp8_value, k_scale, v_scale)``
+    """
+    fp8_k, k_scale = _quantize_tensor_fp8(key)
+    fp8_v, v_scale = _quantize_tensor_fp8(value)
+    return fp8_k, fp8_v, k_scale, v_scale
 
 
 def dequantize_fp8(
@@ -55,7 +72,7 @@ def dequantize_fp8(
 
     Args:
         tensor: FP8-quantized tensor (float8_e4m3fn).
-        inv_scale: Inverse scale (dequant scale) produced by :func:`quantize_kv_fp8`.
+        inv_scale: Inverse scale (dequant scale).
         output_dtype: Target dtype (e.g. ``torch.bfloat16``).
 
     Returns:

From bd9931c4c5327d2ef29cf2dab759356634726390 Mon Sep 17 00:00:00 2001
From: lishunyang <lishunyang12@163.com>
Date: Thu, 2 Apr 2026 18:45:43 +0800
Subject: [PATCH 03/45] Add --kv-cache-dtype flag to text_to_video.py

Signed-off-by: lishunyang <lishunyang12@163.com>
---
 .../text_to_video/text_to_video.py                  | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/examples/offline_inference/text_to_video/text_to_video.py b/examples/offline_inference/text_to_video/text_to_video.py
index a3aa818d2e6..b5fcc0086bf 100644
--- a/examples/offline_inference/text_to_video/text_to_video.py
+++ b/examples/offline_inference/text_to_video/text_to_video.py
@@ -180,6 +180,16 @@ def parse_args() -> argparse.Namespace:
         choices=["fp8", "gguf"],
         help="Quantization method for the transformer (fp8 for online FP8 quantization).",
     )
+    parser.add_argument(
+        "--kv-cache-dtype",
+        type=str,
+        default="auto",
+        choices=["auto", "fp8"],
+        help="Data type for attention Q/K/V quantization. "
+        "'fp8': dynamically quantize to float8_e4m3fn each forward pass. "
+        "On Hopper GPUs with FA3, enables native FP8 attention compute. "
+        "'auto': no quantization (default).",
+    )
     return parser.parse_args()
 
 
@@ -221,6 +231,8 @@ def main():
     # Check if profiling is requested via environment variable
     profiler_enabled = bool(os.getenv("VLLM_TORCH_PROFILER_DIR"))
 
+    kv_cache_dtype = args.kv_cache_dtype if args.kv_cache_dtype != "auto" else None
+
     omni_kwargs = dict(
         model=args.model,
         enable_layerwise_offload=args.enable_layerwise_offload,
@@ -232,6 +244,7 @@ def main():
         cache_backend=args.cache_backend,
         cache_config=cache_config,
         enable_diffusion_pipeline_profiler=args.enable_diffusion_pipeline_profiler,
+        kv_cache_dtype=kv_cache_dtype,
     )
     if args.boundary_ratio is not None:
         omni_kwargs["boundary_ratio"] = args.boundary_ratio

From 1ec32364024d243353ace6567c89add39bf207b2 Mon Sep 17 00:00:00 2001
From: lishunyang <lishunyang12@163.com>
Date: Thu, 2 Apr 2026 19:04:55 +0800
Subject: [PATCH 04/45] Fix FP8 attention not activating: lazy-resolve config
 from forward context

forward_context is not set during model loading when Attention.__init__
runs, so kv_cache_dtype was never read. Resolve it lazily on first
forward() call instead.

Signed-off-by: lishunyang <lishunyang12@163.com>
---
 vllm_omni/diffusion/attention/layer.py | 37 +++++++++++++++-----------
 1 file changed, 22 insertions(+), 15 deletions(-)

diff --git a/vllm_omni/diffusion/attention/layer.py b/vllm_omni/diffusion/attention/layer.py
index f849d2df9d5..da97228e4d1 100644
--- a/vllm_omni/diffusion/attention/layer.py
+++ b/vllm_omni/diffusion/attention/layer.py
@@ -91,20 +91,9 @@ def __init__(
         # Fallback strategy when SP is not active (outside sharded regions)
         self._no_parallel_strategy = NoParallelAttention()
 
-        # FP8 attention quantization: read from forward context config
-        self._fp8_attn_enabled = False
-        try:
-            config = get_forward_context().omni_diffusion_config
-            self._fp8_attn_enabled = config.kv_cache_dtype == "fp8"
-        except Exception:
-            pass
-
-        if self._fp8_attn_enabled and self.use_ring:
-            raise ValueError(
-                "FP8 attention quantization is not compatible with ring attention "
-                "(ring_degree > 1). Ring kernels do not propagate FP8 descale "
-                "factors. Use Ulysses SP instead."
-            )
+        # FP8 attention quantization: resolved lazily in forward() because
+        # forward_context is not available during model loading.
+        self._fp8_attn_enabled: bool | None = None
 
     def _get_active_parallel_strategy(self):
         """Get the parallel strategy based on current SP active state.
@@ -154,6 +143,24 @@ def _quantize_qkv_fp8(
 
         return fp8_q, fp8_k, fp8_v, attn_metadata
 
+    def _resolve_fp8_attn(self) -> bool:
+        """Lazily resolve FP8 attention config from forward context."""
+        if self._fp8_attn_enabled is not None:
+            return self._fp8_attn_enabled
+        try:
+            config = get_forward_context().omni_diffusion_config
+            enabled = config.kv_cache_dtype == "fp8"
+        except Exception:
+            enabled = False
+        if enabled and self.use_ring:
+            raise ValueError(
+                "FP8 attention quantization is not compatible with ring attention "
+                "(ring_degree > 1). Ring kernels do not propagate FP8 descale "
+                "factors. Use Ulysses SP instead."
+            )
+        self._fp8_attn_enabled = enabled
+        return enabled
+
     def forward(
         self,
         query: torch.Tensor,
@@ -170,7 +177,7 @@ def forward(
         query, key, value, attn_metadata, ctx = strategy.pre_attention(query, key, value, attn_metadata)
 
         # 1.5 FP8 Q/K/V quantization (after AllToAll stays BF16, before kernel)
-        if self._fp8_attn_enabled:
+        if self._resolve_fp8_attn():
             query, key, value, attn_metadata = self._quantize_qkv_fp8(
                 query, key, value, attn_metadata
             )

From 5a5ed10ec1fea46524423739d8d24c4c2680adba Mon Sep 17 00:00:00 2001
From: lishunyang <lishunyang12@163.com>
Date: Thu, 2 Apr 2026 20:04:48 +0800
Subject: [PATCH 05/45] Add debug logging for FP8 attention resolution

Signed-off-by: lishunyang <lishunyang12@163.com>
---
 vllm_omni/diffusion/attention/layer.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/vllm_omni/diffusion/attention/layer.py b/vllm_omni/diffusion/attention/layer.py
index da97228e4d1..28cde6e06c1 100644
--- a/vllm_omni/diffusion/attention/layer.py
+++ b/vllm_omni/diffusion/attention/layer.py
@@ -150,7 +150,13 @@ def _resolve_fp8_attn(self) -> bool:
         try:
             config = get_forward_context().omni_diffusion_config
             enabled = config.kv_cache_dtype == "fp8"
-        except Exception:
+            logger.info(
+                "FP8 attention resolved: kv_cache_dtype=%s, enabled=%s",
+                getattr(config, "kv_cache_dtype", "MISSING"),
+                enabled,
+            )
+        except Exception as e:
+            logger.warning("FP8 attention resolve failed: %s", e)
             enabled = False
         if enabled and self.use_ring:
             raise ValueError(

From 841ad38cdbc0ed46a475fb8348c234962a73a51f Mon Sep 17 00:00:00 2001
From: lishunyang <lishunyang12@163.com>
Date: Thu, 2 Apr 2026 20:14:35 +0800
Subject: [PATCH 06/45] Wire kv_cache_dtype through default stage config

Signed-off-by: lishunyang <lishunyang12@163.com>
---
 vllm_omni/engine/async_omni_engine.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py
index 9de3dc867ff..b102294891b 100644
--- a/vllm_omni/engine/async_omni_engine.py
+++ b/vllm_omni/engine/async_omni_engine.py
@@ -876,6 +876,7 @@ def _create_default_diffusion_stage_cfg(kwargs: dict[str, Any]) -> list:
                     "enable_multithread_weight_load": kwargs.get("enable_multithread_weight_load", True),
                     "num_weight_load_threads": kwargs.get("num_weight_load_threads", 4),
                     "quantization": kwargs.get("quantization", None),
+                    "kv_cache_dtype": kwargs.get("kv_cache_dtype", None),
                     "enable_diffusion_pipeline_profiler": kwargs.get("enable_diffusion_pipeline_profiler", False),
                     **(
                         {

From aaf3930e7c30892b80c26d761f4e8ec225cf87ce Mon Sep 17 00:00:00 2001
From: lishunyang <lishunyang12@163.com>
Date: Fri, 3 Apr 2026 00:03:33 +0800
Subject: [PATCH 07/45] Use vLLM fused CUDA kernel for FP8 quantization

Replace 3-op PyTorch quantization (amax + scale + cast) with vLLM's
scaled_fp8_quant fused CUDA kernel. Single kernel launch for the
entire quantize operation, eliminating the per-tensor amax reduction
bottleneck that caused FP8 to be slower than BF16 at long sequences.

Falls back to PyTorch ops if the CUDA kernel is unavailable.

Signed-off-by: lishunyang <lishunyang12@163.com>
---
 vllm_omni/quantization/kv_quant.py | 57 +++++++++++++++++++++++++-----
 1 file changed, 49 insertions(+), 8 deletions(-)

diff --git a/vllm_omni/quantization/kv_quant.py b/vllm_omni/quantization/kv_quant.py
index 9baf3aa590f..4502dee71c3 100644
--- a/vllm_omni/quantization/kv_quant.py
+++ b/vllm_omni/quantization/kv_quant.py
@@ -5,9 +5,27 @@
 Provides per-tensor dynamic quantization of Q/K/V tensors to
 float8_e4m3fn format. Designed for diffusion models where Q/K/V are
 computed fresh each forward pass (no persistent KV cache).
+
+Uses vLLM's fused CUDA kernel (scaled_fp8_quant) for efficient
+amax+scale+cast in a single kernel launch.
 """
 
 import torch
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+# Try to use vLLM's fused CUDA kernel; fall back to PyTorch ops.
+try:
+    from vllm._custom_ops import scaled_fp8_quant as _vllm_scaled_fp8_quant
+
+    _HAS_FUSED_QUANT = True
+except ImportError:
+    _HAS_FUSED_QUANT = False
+    logger.warning_once(
+        "vLLM scaled_fp8_quant not available, using PyTorch ops fallback. "
+        "FP8 attention will work but with higher quantization overhead."
+    )
 
 
 def _quantize_tensor_fp8(
@@ -15,22 +33,45 @@ def _quantize_tensor_fp8(
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """Quantize a single tensor to FP8 with per-tensor dynamic scaling.
 
+    Uses vLLM's fused CUDA kernel when available (single kernel launch
+    for amax reduction + scale computation + FP8 cast). Falls back to
+    3 separate PyTorch ops otherwise.
+
     Returns:
         ``(fp8_tensor, inv_scale)`` where inv_scale is the dequant scale.
     """
-    finfo = torch.finfo(torch.float8_e4m3fn)
-    amax = tensor.abs().amax().clamp(min=1e-12)
-    scale_factor = finfo.max / amax
-    fp8 = (tensor * scale_factor).clamp(finfo.min, finfo.max).to(torch.float8_e4m3fn)
-    inv_scale = amax / finfo.max
-    return fp8, inv_scale
+    if _HAS_FUSED_QUANT:
+        # scaled_fp8_quant requires 2D input [M, N]
+        orig_shape = tensor.shape
+        flat = tensor.reshape(-1, orig_shape[-1])
+        # Dynamic per-tensor quantization: scale=None
+        fp8_flat, scale = _vllm_scaled_fp8_quant(flat)
+        fp8_out = fp8_flat.reshape(orig_shape)
+        # scale from vLLM is 1/scale (inv_scale / dequant scale)
+        return fp8_out, scale
+    else:
+        finfo = torch.finfo(torch.float8_e4m3fn)
+        amax = tensor.abs().amax().clamp(min=1e-12)
+        scale_factor = finfo.max / amax
+        fp8 = (tensor * scale_factor).clamp(finfo.min, finfo.max).to(
+            torch.float8_e4m3fn
+        )
+        inv_scale = amax / finfo.max
+        return fp8, inv_scale
 
 
 def quantize_qkv_fp8(
     query: torch.Tensor,
     key: torch.Tensor,
     value: torch.Tensor,
-) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+) -> tuple[
+    torch.Tensor,
+    torch.Tensor,
+    torch.Tensor,
+    torch.Tensor,
+    torch.Tensor,
+    torch.Tensor,
+]:
     """Quantize Q/K/V tensors to float8_e4m3fn with dynamic per-tensor scaling.
 
     Args:
@@ -40,7 +81,7 @@ def quantize_qkv_fp8(
 
     Returns:
         ``(fp8_query, fp8_key, fp8_value, q_scale, k_scale, v_scale)``
-        where scales are inverse (dequant) scales: ``inv_scale = amax / FP8_MAX``.
+        where scales are inverse (dequant) scales.
         Pass as ``descale_q/k/v`` to FA3 or use :func:`dequantize_fp8`.
     """
     fp8_q, q_scale = _quantize_tensor_fp8(query)

From 2c4e9eab4affefc45caecd151d53866d3805fa33 Mon Sep 17 00:00:00 2001
From: lishunyang <lishunyang12@163.com>
Date: Fri, 3 Apr 2026 00:09:05 +0800
Subject: [PATCH 08/45] Fix dequant dtype: cast result to output_dtype after
 f32 scale multiply

Signed-off-by: lishunyang <lishunyang12@163.com>
---
 vllm_omni/quantization/kv_quant.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm_omni/quantization/kv_quant.py b/vllm_omni/quantization/kv_quant.py
index 4502dee71c3..6a621f6fdc5 100644
--- a/vllm_omni/quantization/kv_quant.py
+++ b/vllm_omni/quantization/kv_quant.py
@@ -119,4 +119,4 @@ def dequantize_fp8(
     Returns:
         Dequantized tensor: ``tensor.to(output_dtype) * inv_scale``.
     """
-    return tensor.to(output_dtype) * inv_scale
+    return (tensor.to(output_dtype) * inv_scale).to(output_dtype)

From 2934ed0888b41a5c7655beef2a7bde8fb5406da1 Mon Sep 17 00:00:00 2001
From: lishunyang <lishunyang12@163.com>
Date: Fri, 3 Apr 2026 01:14:01 +0800
Subject: [PATCH 09/45] Skip FP8 quantization when padding mask is present

FA3 varlen path doesn't support descale, so quantize+dequant is pure
overhead when padding is detected. Check for padding before quantizing
to avoid the unnecessary roundtrip.

Signed-off-by: lishunyang <lishunyang12@163.com>
---
 vllm_omni/diffusion/attention/layer.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/vllm_omni/diffusion/attention/layer.py b/vllm_omni/diffusion/attention/layer.py
index 28cde6e06c1..d453a23db43 100644
--- a/vllm_omni/diffusion/attention/layer.py
+++ b/vllm_omni/diffusion/attention/layer.py
@@ -183,10 +183,15 @@ def forward(
         query, key, value, attn_metadata, ctx = strategy.pre_attention(query, key, value, attn_metadata)
 
         # 1.5 FP8 Q/K/V quantization (after AllToAll stays BF16, before kernel)
+        # Skip when padding mask is present — FA3 varlen doesn't support
+        # descale, so quantize+dequant would be pure overhead.
         if self._resolve_fp8_attn():
-            query, key, value, attn_metadata = self._quantize_qkv_fp8(
-                query, key, value, attn_metadata
-            )
+            attn_mask = attn_metadata.attn_mask if attn_metadata is not None else None
+            has_padding = attn_mask is not None and torch.any(~attn_mask)
+            if not has_padding:
+                query, key, value, attn_metadata = self._quantize_qkv_fp8(
+                    query, key, value, attn_metadata
+                )
 
         # 2. Kernel Execution (Computation)
         if self.use_ring and strategy is not self._no_parallel_strategy:

From ad37fdc2be1379275b8e3c171d43eefee4e8ab52 Mon Sep 17 00:00:00 2001
From: lishunyang <lishunyang12@163.com>
Date: Fri, 3 Apr 2026 01:31:47 +0800
Subject: [PATCH 10/45] Add debug scripts for FA3 FP8 capability check and mask
 stats

Signed-off-by: lishunyang <lishunyang12@163.com>
---
 debug_fa3_check.py                     | 93 ++++++++++++++++++++++++++
 vllm_omni/diffusion/attention/layer.py | 16 +++++
 2 files changed, 109 insertions(+)
 create mode 100644 debug_fa3_check.py

diff --git a/debug_fa3_check.py b/debug_fa3_check.py
new file mode 100644
index 00000000000..2c69cef9543
--- /dev/null
+++ b/debug_fa3_check.py
@@ -0,0 +1,93 @@
+"""Debug script: check FA3 FP8 capabilities and attention mask behavior."""
+import inspect
+
+print("=" * 60)
+print("1. FA3 FP8 descale support check")
+print("=" * 60)
+
+# Check fa3_fwd interface
+try:
+    from fa3_fwd_interface import _flash_attn_forward
+    sig = inspect.signature(_flash_attn_forward)
+    params = list(sig.parameters.keys())
+    print(f"fa3_fwd params: {params}")
+    has_descale = any("descale" in p for p in params)
+    print(f"Has descale support: {has_descale}")
+except Exception as e:
+    print(f"fa3_fwd not available: {e}")
+
+# Check flash_attn varlen
+try:
+    from flash_attn.flash_attn_interface import flash_attn_varlen_func
+    sig = inspect.signature(flash_attn_varlen_func)
+    params = list(sig.parameters.keys())
+    print(f"\nvarlen params: {params}")
+    has_descale = any("descale" in p for p in params)
+    print(f"Varlen has descale: {has_descale}")
+except Exception as e:
+    print(f"flash_attn varlen not available: {e}")
+
+# Check flash_attn regular func
+try:
+    from flash_attn.flash_attn_interface import flash_attn_func
+    sig = inspect.signature(flash_attn_func)
+    params = list(sig.parameters.keys())
+    print(f"\nflash_attn_func params: {params}")
+    has_descale = any("descale" in p for p in params)
+    print(f"flash_attn_func has descale: {has_descale}")
+except Exception as e:
+    print(f"flash_attn_func not available: {e}")
+
+# Check FA3 version
+print("\n" + "=" * 60)
+print("2. Package versions")
+print("=" * 60)
+try:
+    import flash_attn
+    print(f"flash_attn version: {flash_attn.__version__}")
+except Exception:
+    print("flash_attn: not installed or no __version__")
+
+try:
+    import fa3_fwd_cuda
+    print(f"fa3_fwd_cuda: available")
+except Exception:
+    print("fa3_fwd_cuda: not available")
+
+import torch
+print(f"torch version: {torch.__version__}")
+print(f"CUDA version: {torch.version.cuda}")
+print(f"GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'N/A'}")
+
+# Check ring_globals imports (what vllm-omni actually uses)
+print("\n" + "=" * 60)
+print("3. vllm-omni FA3 imports")
+print("=" * 60)
+try:
+    from vllm_omni.diffusion.attention.backends.ring.ring_globals import (
+        HAS_FA3,
+        fa3_attn_func,
+    )
+    print(f"HAS_FA3: {HAS_FA3}")
+    print(f"fa3_attn_func: {fa3_attn_func}")
+    if fa3_attn_func is not None:
+        sig = inspect.signature(fa3_attn_func)
+        params = list(sig.parameters.keys())
+        print(f"fa3_attn_func params: {params}")
+        has_descale = any("descale" in p for p in params)
+        print(f"fa3_attn_func has descale: {has_descale}")
+except Exception as e:
+    print(f"ring_globals import failed: {e}")
+
+# Check if flash_attn_varlen_func is available through vllm-omni's utils
+try:
+    from vllm_omni.diffusion.attention.backends.utils.fa import (
+        flash_attn_varlen_func,
+    )
+    sig = inspect.signature(flash_attn_varlen_func)
+    params = list(sig.parameters.keys())
+    print(f"\nvllm-omni varlen func params: {params}")
+    has_descale = any("descale" in p for p in params)
+    print(f"vllm-omni varlen has descale: {has_descale}")
+except Exception as e:
+    print(f"vllm-omni varlen not available: {e}")
diff --git a/vllm_omni/diffusion/attention/layer.py b/vllm_omni/diffusion/attention/layer.py
index d453a23db43..372800455a0 100644
--- a/vllm_omni/diffusion/attention/layer.py
+++ b/vllm_omni/diffusion/attention/layer.py
@@ -188,6 +188,22 @@ def forward(
         if self._resolve_fp8_attn():
             attn_mask = attn_metadata.attn_mask if attn_metadata is not None else None
             has_padding = attn_mask is not None and torch.any(~attn_mask)
+            # DEBUG: log mask stats (remove before finalizing PR)
+            if not hasattr(self, '_debug_mask_logged'):
+                self._debug_mask_logged = True
+                if attn_mask is not None:
+                    n_true = attn_mask.sum().item()
+                    n_total = attn_mask.numel()
+                    n_false = n_total - n_true
+                    logger.info(
+                        "DEBUG mask stats: shape=%s, true=%d, false=%d, "
+                        "has_padding=%s, q_shape=%s",
+                        list(attn_mask.shape), n_true, n_false,
+                        has_padding, list(query.shape),
+                    )
+                else:
+                    logger.info("DEBUG mask stats: attn_mask=None, q_shape=%s",
+                                list(query.shape))
             if not has_padding:
                 query, key, value, attn_metadata = self._quantize_qkv_fp8(
                     query, key, value, attn_metadata

From 2d5906ad1dfd361603017bffc10f2bc326aff1f9 Mon Sep 17 00:00:00 2001
From: lishunyang <lishunyang12@163.com>
Date: Fri, 3 Apr 2026 01:36:09 +0800
Subject: [PATCH 11/45] Fix FA3 FP8 descale param names and enable FP8 varlen
 path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two critical fixes:
1. Param names were wrong: descale_q/k/v -> q_descale/k_descale/v_descale.
   The TypeError catch silently swallowed this, so FA3 FP8 native path was
   never actually running — all calls fell through to BF16 dequant fallback.

2. FA3 varlen now supports q/k/v_descale, so pass descales through to
   _forward_varlen_masked instead of dequanting. This enables FP8 for
   all attention calls including those with padding masks.

Signed-off-by: lishunyang <lishunyang12@163.com>
---
 .../attention/backends/flash_attn.py          | 97 +++++++++----------
 vllm_omni/diffusion/attention/layer.py        | 27 +-----
 2 files changed, 49 insertions(+), 75 deletions(-)

diff --git a/vllm_omni/diffusion/attention/backends/flash_attn.py b/vllm_omni/diffusion/attention/backends/flash_attn.py
index 620ca90056b..0d43f9dc6c2 100644
--- a/vllm_omni/diffusion/attention/backends/flash_attn.py
+++ b/vllm_omni/diffusion/attention/backends/flash_attn.py
@@ -59,6 +59,9 @@ def _forward_varlen_masked(
         key: torch.Tensor,
         value: torch.Tensor,
         attention_mask: torch.Tensor,
+        q_descale: torch.Tensor | None = None,
+        k_descale: torch.Tensor | None = None,
+        v_descale: torch.Tensor | None = None,
     ) -> torch.Tensor:
         from vllm_omni.diffusion.attention.backends.utils.fa import (
             _pad_input,
@@ -73,6 +76,15 @@ def _forward_varlen_masked(
             query, key, value, attention_mask, query_length, _unpad_input
         )
 
+        varlen_kwargs: dict = {
+            "causal": self.causal,
+            "softmax_scale": self.softmax_scale,
+        }
+        if q_descale is not None:
+            varlen_kwargs["q_descale"] = q_descale
+            varlen_kwargs["k_descale"] = k_descale
+            varlen_kwargs["v_descale"] = v_descale
+
         out_unpad = flash_attn_varlen_func(
             q,
             k,
@@ -81,10 +93,7 @@ def _forward_varlen_masked(
             cu_seqlens_k=cu_seq_lens_k,
             max_seqlen_q=max_length_q,
             max_seqlen_k=max_length_k,
-            **{
-                "causal": self.causal,
-                "softmax_scale": self.softmax_scale,
-            },
+            **varlen_kwargs,
         )
         out_unpad = self._unwrap_flash_output(out_unpad)
         return _pad_input(out_unpad, indices_q, query.size(0), query_length)
@@ -220,18 +229,24 @@ def _forward_fp8(
         value: torch.Tensor,
         attn_metadata: AttentionMetadata,
     ) -> torch.Tensor:
-        """FP8 Q/K/V attention path: native FA3 or dequant fallback."""
-        from vllm_omni.quantization.kv_quant import dequantize_fp8
-
+        """FP8 Q/K/V attention path: native FA3 with descale."""
         q_scale = attn_metadata.q_scale
         k_scale = attn_metadata.k_scale
         v_scale = attn_metadata.v_scale
 
-        attention_mask = attn_metadata.attn_mask if attn_metadata is not None else None
-        has_padding = attention_mask is not None and torch.any(~attention_mask)
+        from vllm_omni.diffusion.attention.backends.ring.ring_globals import (
+            HAS_FA3,
+            fa3_attn_func,
+        )
 
-        # If padding is present, dequant and use the standard masked path
-        if has_padding:
+        if not (HAS_FA3 and fa3_attn_func is not None):
+            # No FA3: dequant and use standard path
+            from vllm_omni.quantization.kv_quant import dequantize_fp8
+
+            logger.warning_once(
+                "FP8 attention without FA3 provides no compute benefit. "
+                "Install FA3 for optimal FP8 support on Hopper GPUs."
+            )
             output_dtype = torch.bfloat16
             query = dequantize_fp8(query, q_scale, output_dtype)
             key = dequantize_fp8(key, k_scale, output_dtype)
@@ -241,45 +256,25 @@ def _forward_fp8(
             attn_metadata.v_scale = None
             return self.forward_cuda(query, key, value, attn_metadata)
 
-        # Try FA3 native FP8 (Hopper / Ada / Ampere via fa3-fwd)
-        from vllm_omni.diffusion.attention.backends.ring.ring_globals import (
-            HAS_FA3,
-            fa3_attn_func,
-        )
+        attention_mask = attn_metadata.attn_mask if attn_metadata is not None else None
+        has_padding = attention_mask is not None and torch.any(~attention_mask)
 
-        if HAS_FA3 and fa3_attn_func is not None:
-            fa3_kwargs: dict = {
-                "softmax_scale": self.softmax_scale,
-                "causal": self.causal,
-                "descale_k": k_scale,
-                "descale_v": v_scale,
-            }
-            # descale_q requires FA3 >= 2.7; guard against older versions
-            try:
-                out = fa3_attn_func(
-                    query, key, value, descale_q=q_scale, **fa3_kwargs
-                )
-            except TypeError:
-                logger.warning_once(
-                    "FA3 does not support descale_q (version < 2.7). "
-                    "Q will run in FP8 without descaling — consider upgrading."
-                )
-                out = fa3_attn_func(query, key, value, **fa3_kwargs)
-            if isinstance(out, tuple):
-                out = out[0]
-            return out
-
-        # Fallback: dequantize to compute dtype and use standard path
-        logger.warning_once(
-            "FP8 attention without FA3 provides no compute benefit. "
-            "Install FA3 for optimal FP8 support on Hopper GPUs."
+        if has_padding:
+            # FA3 varlen with FP8 descale
+            return self._forward_varlen_masked(
+                query, key, value, attention_mask,
+                q_descale=q_scale, k_descale=k_scale, v_descale=v_scale,
+            )
+
+        # FA3 regular path with FP8 descale
+        out = fa3_attn_func(
+            query, key, value,
+            softmax_scale=self.softmax_scale,
+            causal=self.causal,
+            q_descale=q_scale,
+            k_descale=k_scale,
+            v_descale=v_scale,
         )
-        output_dtype = torch.bfloat16
-        query_bf16 = dequantize_fp8(query, q_scale, output_dtype)
-        key_bf16 = dequantize_fp8(key, k_scale, output_dtype)
-        value_bf16 = dequantize_fp8(value, v_scale, output_dtype)
-        # Clear scales to avoid re-detection on recursive call
-        attn_metadata.q_scale = None
-        attn_metadata.k_scale = None
-        attn_metadata.v_scale = None
-        return self.forward_cuda(query_bf16, key_bf16, value_bf16, attn_metadata)
+        if isinstance(out, tuple):
+            out = out[0]
+        return out
diff --git a/vllm_omni/diffusion/attention/layer.py b/vllm_omni/diffusion/attention/layer.py
index 372800455a0..28cde6e06c1 100644
--- a/vllm_omni/diffusion/attention/layer.py
+++ b/vllm_omni/diffusion/attention/layer.py
@@ -183,31 +183,10 @@ def forward(
         query, key, value, attn_metadata, ctx = strategy.pre_attention(query, key, value, attn_metadata)
 
         # 1.5 FP8 Q/K/V quantization (after AllToAll stays BF16, before kernel)
-        # Skip when padding mask is present — FA3 varlen doesn't support
-        # descale, so quantize+dequant would be pure overhead.
         if self._resolve_fp8_attn():
-            attn_mask = attn_metadata.attn_mask if attn_metadata is not None else None
-            has_padding = attn_mask is not None and torch.any(~attn_mask)
-            # DEBUG: log mask stats (remove before finalizing PR)
-            if not hasattr(self, '_debug_mask_logged'):
-                self._debug_mask_logged = True
-                if attn_mask is not None:
-                    n_true = attn_mask.sum().item()
-                    n_total = attn_mask.numel()
-                    n_false = n_total - n_true
-                    logger.info(
-                        "DEBUG mask stats: shape=%s, true=%d, false=%d, "
-                        "has_padding=%s, q_shape=%s",
-                        list(attn_mask.shape), n_true, n_false,
-                        has_padding, list(query.shape),
-                    )
-                else:
-                    logger.info("DEBUG mask stats: attn_mask=None, q_shape=%s",
-                                list(query.shape))
-            if not has_padding:
-                query, key, value, attn_metadata = self._quantize_qkv_fp8(
-                    query, key, value, attn_metadata
-                )
+            query, key, value, attn_metadata = self._quantize_qkv_fp8(
+                query, key, value, attn_metadata
+            )
 
         # 2. Kernel Execution (Computation)
         if self.use_ring and strategy is not self._no_parallel_strategy:

From 502f5aa0e50cc7a9976af99086023ad6be7df817 Mon Sep 17 00:00:00 2001
From: lishunyang <lishunyang12@163.com>
Date: Fri, 3 Apr 2026 01:43:34 +0800
Subject: [PATCH 12/45] Expand debug script: micro-benchmarks, varlen FP8,
 layer breakdown

Signed-off-by: lishunyang <lishunyang12@163.com>
---
 debug_fa3_check.py | 344 +++++++++++++++++++++++++++++++++++++++------
 1 file changed, 303 insertions(+), 41 deletions(-)

diff --git a/debug_fa3_check.py b/debug_fa3_check.py
index 2c69cef9543..00c88150277 100644
--- a/debug_fa3_check.py
+++ b/debug_fa3_check.py
@@ -1,11 +1,17 @@
-"""Debug script: check FA3 FP8 capabilities and attention mask behavior."""
+"""Debug script: check FA3 FP8 capabilities, run micro-benchmarks,
+and profile attention kernel performance for optimization planning."""
 import inspect
+import time
 
+import torch
+
+# ============================================================
+# 1. FA3 FP8 descale support check
+# ============================================================
 print("=" * 60)
 print("1. FA3 FP8 descale support check")
 print("=" * 60)
 
-# Check fa3_fwd interface
 try:
     from fa3_fwd_interface import _flash_attn_forward
     sig = inspect.signature(_flash_attn_forward)
@@ -16,78 +22,334 @@
 except Exception as e:
     print(f"fa3_fwd not available: {e}")
 
-# Check flash_attn varlen
 try:
-    from flash_attn.flash_attn_interface import flash_attn_varlen_func
+    from vllm_omni.diffusion.attention.backends.utils.fa import (
+        flash_attn_varlen_func,
+    )
     sig = inspect.signature(flash_attn_varlen_func)
     params = list(sig.parameters.keys())
-    print(f"\nvarlen params: {params}")
+    print(f"\nvarlen func params: {params}")
     has_descale = any("descale" in p for p in params)
     print(f"Varlen has descale: {has_descale}")
 except Exception as e:
-    print(f"flash_attn varlen not available: {e}")
+    print(f"varlen not available: {e}")
 
-# Check flash_attn regular func
 try:
-    from flash_attn.flash_attn_interface import flash_attn_func
-    sig = inspect.signature(flash_attn_func)
-    params = list(sig.parameters.keys())
-    print(f"\nflash_attn_func params: {params}")
-    has_descale = any("descale" in p for p in params)
-    print(f"flash_attn_func has descale: {has_descale}")
+    from vllm_omni.diffusion.attention.backends.ring.ring_globals import (
+        HAS_FA3,
+        fa3_attn_func,
+    )
+    print(f"\nHAS_FA3: {HAS_FA3}")
+    if fa3_attn_func is not None:
+        sig = inspect.signature(fa3_attn_func)
+        params = list(sig.parameters.keys())
+        print(f"fa3_attn_func params: {params}")
 except Exception as e:
-    print(f"flash_attn_func not available: {e}")
+    print(f"ring_globals import failed: {e}")
 
-# Check FA3 version
+# ============================================================
+# 2. Package versions and GPU info
+# ============================================================
 print("\n" + "=" * 60)
-print("2. Package versions")
+print("2. Environment")
 print("=" * 60)
+
 try:
     import flash_attn
     print(f"flash_attn version: {flash_attn.__version__}")
 except Exception:
     print("flash_attn: not installed or no __version__")
 
+print(f"torch version: {torch.__version__}")
+print(f"CUDA version: {torch.version.cuda}")
+if torch.cuda.is_available():
+    gpu = torch.cuda.get_device_name(0)
+    cap = torch.cuda.get_device_capability(0)
+    mem = torch.cuda.get_device_properties(0).total_mem / 1024**3
+    print(f"GPU: {gpu} (SM {cap[0]}{cap[1]}, {mem:.1f} GB)")
+    print(f"  FP8 tensor cores: {'Yes' if cap[0] >= 9 else 'No'} (need SM90+)")
+else:
+    print("GPU: N/A")
+
+# Check vLLM fused quant kernel
 try:
-    import fa3_fwd_cuda
-    print(f"fa3_fwd_cuda: available")
+    from vllm._custom_ops import scaled_fp8_quant
+    print(f"vLLM scaled_fp8_quant: available")
 except Exception:
-    print("fa3_fwd_cuda: not available")
+    print("vLLM scaled_fp8_quant: NOT available (will use PyTorch fallback)")
 
-import torch
-print(f"torch version: {torch.__version__}")
-print(f"CUDA version: {torch.version.cuda}")
-print(f"GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'N/A'}")
+# Check torch.compile status
+print(f"torch.compile available: {hasattr(torch, 'compile')}")
+try:
+    import triton
+    print(f"triton version: {triton.__version__}")
+except Exception:
+    print("triton: not installed")
+
+# ============================================================
+# 3. FP8 micro-benchmarks (quantization overhead)
+# ============================================================
+if not torch.cuda.is_available():
+    print("\nSkipping benchmarks (no GPU)")
+    exit()
+
+print("\n" + "=" * 60)
+print("3. FP8 quantization overhead micro-benchmark")
+print("=" * 60)
+
+device = "cuda"
+
+# Simulate HunyuanVideo tensor shapes
+# 33 frames: ~(1, 2640, 24, 128) for single-stream, (1, 2640+256, 24, 128) for joint
+# 121 frames: ~(1, 9680, 24, 128) for single-stream
+test_shapes = [
+    ("33f single-stream", (1, 2640, 24, 128)),
+    ("33f joint (img+txt)", (1, 2896, 24, 128)),
+    ("121f single-stream", (1, 9680, 24, 128)),
+    ("121f joint (img+txt)", (1, 9936, 24, 128)),
+]
+
+for name, shape in test_shapes:
+    q = torch.randn(shape, dtype=torch.bfloat16, device=device)
+    k = torch.randn(shape, dtype=torch.bfloat16, device=device)
+    v = torch.randn(shape, dtype=torch.bfloat16, device=device)
+
+    # Warmup
+    for _ in range(3):
+        from vllm_omni.quantization.kv_quant import quantize_qkv_fp8
+        quantize_qkv_fp8(q, k, v)
+    torch.cuda.synchronize()
 
-# Check ring_globals imports (what vllm-omni actually uses)
+    # Benchmark quantization
+    n_iters = 20
+    start = time.perf_counter()
+    for _ in range(n_iters):
+        fp8_q, fp8_k, fp8_v, qs, ks, vs = quantize_qkv_fp8(q, k, v)
+    torch.cuda.synchronize()
+    quant_time = (time.perf_counter() - start) / n_iters * 1000
+
+    print(f"  {name} {list(shape)}: quant={quant_time:.2f} ms")
+
+# ============================================================
+# 4. FA3 attention kernel benchmark (BF16 vs FP8)
+# ============================================================
 print("\n" + "=" * 60)
-print("3. vllm-omni FA3 imports")
+print("4. FA3 attention kernel benchmark (BF16 vs FP8)")
 print("=" * 60)
+
 try:
     from vllm_omni.diffusion.attention.backends.ring.ring_globals import (
         HAS_FA3,
         fa3_attn_func,
     )
-    print(f"HAS_FA3: {HAS_FA3}")
-    print(f"fa3_attn_func: {fa3_attn_func}")
-    if fa3_attn_func is not None:
-        sig = inspect.signature(fa3_attn_func)
-        params = list(sig.parameters.keys())
-        print(f"fa3_attn_func params: {params}")
-        has_descale = any("descale" in p for p in params)
-        print(f"fa3_attn_func has descale: {has_descale}")
+    if not HAS_FA3 or fa3_attn_func is None:
+        raise RuntimeError("FA3 not available")
 except Exception as e:
-    print(f"ring_globals import failed: {e}")
+    print(f"Skipping: {e}")
+    exit()
+
+bench_shapes = [
+    ("33f", (1, 2640, 24, 128)),
+    ("121f", (1, 9680, 24, 128)),
+]
+
+n_warmup = 5
+n_iters = 20
+
+for name, shape in bench_shapes:
+    B, S, H, D = shape
+    softmax_scale = D ** -0.5
+
+    # BF16 benchmark
+    q_bf16 = torch.randn(shape, dtype=torch.bfloat16, device=device)
+    k_bf16 = torch.randn(shape, dtype=torch.bfloat16, device=device)
+    v_bf16 = torch.randn(shape, dtype=torch.bfloat16, device=device)
+
+    for _ in range(n_warmup):
+        fa3_attn_func(q_bf16, k_bf16, v_bf16, softmax_scale=softmax_scale, causal=False)
+    torch.cuda.synchronize()
+
+    start = time.perf_counter()
+    for _ in range(n_iters):
+        fa3_attn_func(q_bf16, k_bf16, v_bf16, softmax_scale=softmax_scale, causal=False)
+    torch.cuda.synchronize()
+    bf16_time = (time.perf_counter() - start) / n_iters * 1000
+
+    # FP8 benchmark (quantize + attention)
+    fp8_q, fp8_k, fp8_v, qs, ks, vs = quantize_qkv_fp8(q_bf16, k_bf16, v_bf16)
+
+    for _ in range(n_warmup):
+        fa3_attn_func(fp8_q, fp8_k, fp8_v, softmax_scale=softmax_scale,
+                      causal=False, q_descale=qs, k_descale=ks, v_descale=vs)
+    torch.cuda.synchronize()
+
+    # FP8 kernel only (no quant overhead)
+    start = time.perf_counter()
+    for _ in range(n_iters):
+        fa3_attn_func(fp8_q, fp8_k, fp8_v, softmax_scale=softmax_scale,
+                      causal=False, q_descale=qs, k_descale=ks, v_descale=vs)
+    torch.cuda.synchronize()
+    fp8_kernel_time = (time.perf_counter() - start) / n_iters * 1000
+
+    # FP8 end-to-end (quant + attention)
+    start = time.perf_counter()
+    for _ in range(n_iters):
+        fp8_q, fp8_k, fp8_v, qs, ks, vs = quantize_qkv_fp8(q_bf16, k_bf16, v_bf16)
+        fa3_attn_func(fp8_q, fp8_k, fp8_v, softmax_scale=softmax_scale,
+                      causal=False, q_descale=qs, k_descale=ks, v_descale=vs)
+    torch.cuda.synchronize()
+    fp8_e2e_time = (time.perf_counter() - start) / n_iters * 1000
+
+    speedup_kernel = bf16_time / fp8_kernel_time
+    speedup_e2e = bf16_time / fp8_e2e_time
+
+    print(f"\n  {name} {list(shape)}:")
+    print(f"    BF16 attn:           {bf16_time:.2f} ms")
+    print(f"    FP8 kernel only:     {fp8_kernel_time:.2f} ms  ({speedup_kernel:.2f}x)")
+    print(f"    FP8 quant+kernel:    {fp8_e2e_time:.2f} ms  ({speedup_e2e:.2f}x)")
+    print(f"    Quant overhead:      {fp8_e2e_time - fp8_kernel_time:.2f} ms")
+
+# ============================================================
+# 5. FA3 varlen FP8 benchmark (with padding mask)
+# ============================================================
+print("\n" + "=" * 60)
+print("5. FA3 varlen FP8 benchmark (with padding mask)")
+print("=" * 60)
 
-# Check if flash_attn_varlen_func is available through vllm-omni's utils
 try:
     from vllm_omni.diffusion.attention.backends.utils.fa import (
         flash_attn_varlen_func,
+        _unpad_input,
+        _upad_input,
     )
-    sig = inspect.signature(flash_attn_varlen_func)
-    params = list(sig.parameters.keys())
-    print(f"\nvllm-omni varlen func params: {params}")
-    has_descale = any("descale" in p for p in params)
-    print(f"vllm-omni varlen has descale: {has_descale}")
 except Exception as e:
-    print(f"vllm-omni varlen not available: {e}")
+    print(f"Skipping varlen benchmark: {e}")
+    exit()
+
+for name, shape in bench_shapes:
+    B, S, H, D = shape
+    softmax_scale = D ** -0.5
+
+    q_bf16 = torch.randn(shape, dtype=torch.bfloat16, device=device)
+    k_bf16 = torch.randn(shape, dtype=torch.bfloat16, device=device)
+    v_bf16 = torch.randn(shape, dtype=torch.bfloat16, device=device)
+
+    # Create a realistic mask: mostly True, some False at end (encoder padding)
+    mask = torch.ones(B, S, dtype=torch.bool, device=device)
+    n_pad = max(1, S // 20)  # 5% padding
+    mask[:, -n_pad:] = False
+
+    # Unpad inputs
+    q_up, k_up, v_up, indices_q, (cu_q, cu_k), (max_q, max_k) = _upad_input(
+        q_bf16, k_bf16, v_bf16, mask, S, _unpad_input
+    )
+
+    # BF16 varlen
+    for _ in range(n_warmup):
+        flash_attn_varlen_func(q_up, k_up, v_up, cu_seqlens_q=cu_q, cu_seqlens_k=cu_k,
+                               max_seqlen_q=max_q, max_seqlen_k=max_k,
+                               softmax_scale=softmax_scale, causal=False)
+    torch.cuda.synchronize()
+
+    start = time.perf_counter()
+    for _ in range(n_iters):
+        flash_attn_varlen_func(q_up, k_up, v_up, cu_seqlens_q=cu_q, cu_seqlens_k=cu_k,
+                               max_seqlen_q=max_q, max_seqlen_k=max_k,
+                               softmax_scale=softmax_scale, causal=False)
+    torch.cuda.synchronize()
+    varlen_bf16 = (time.perf_counter() - start) / n_iters * 1000
+
+    # FP8 varlen
+    fp8_q, fp8_k, fp8_v, qs, ks, vs = quantize_qkv_fp8(q_bf16, k_bf16, v_bf16)
+    q_up_fp8, k_up_fp8, v_up_fp8, _, _, _ = _upad_input(
+        fp8_q, fp8_k, fp8_v, mask, S, _unpad_input
+    )
+
+    for _ in range(n_warmup):
+        flash_attn_varlen_func(q_up_fp8, k_up_fp8, v_up_fp8,
+                               cu_seqlens_q=cu_q, cu_seqlens_k=cu_k,
+                               max_seqlen_q=max_q, max_seqlen_k=max_k,
+                               softmax_scale=softmax_scale, causal=False,
+                               q_descale=qs, k_descale=ks, v_descale=vs)
+    torch.cuda.synchronize()
+
+    start = time.perf_counter()
+    for _ in range(n_iters):
+        flash_attn_varlen_func(q_up_fp8, k_up_fp8, v_up_fp8,
+                               cu_seqlens_q=cu_q, cu_seqlens_k=cu_k,
+                               max_seqlen_q=max_q, max_seqlen_k=max_k,
+                               softmax_scale=softmax_scale, causal=False,
+                               q_descale=qs, k_descale=ks, v_descale=vs)
+    torch.cuda.synchronize()
+    varlen_fp8 = (time.perf_counter() - start) / n_iters * 1000
+
+    speedup = varlen_bf16 / varlen_fp8
+
+    print(f"\n  {name} {list(shape)} (5% padding):")
+    print(f"    BF16 varlen:  {varlen_bf16:.2f} ms")
+    print(f"    FP8 varlen:   {varlen_fp8:.2f} ms  ({speedup:.2f}x)")
+
+# ============================================================
+# 6. Breakdown: where time goes in a DiT layer
+# ============================================================
+print("\n" + "=" * 60)
+print("6. Time breakdown estimate for one DiT layer")
+print("=" * 60)
+
+shape_121f = (1, 9680, 24, 128)
+B, S, H, D = shape_121f
+hidden_dim = H * D  # 3072
+softmax_scale = D ** -0.5
+
+# Linear projections (Q/K/V projection + output projection)
+x = torch.randn(B, S, hidden_dim, dtype=torch.bfloat16, device=device)
+w_qkv = torch.randn(hidden_dim * 3, hidden_dim, dtype=torch.bfloat16, device=device)
+w_out = torch.randn(hidden_dim, hidden_dim, dtype=torch.bfloat16, device=device)
+
+for _ in range(n_warmup):
+    torch.nn.functional.linear(x, w_qkv)
+torch.cuda.synchronize()
+
+start = time.perf_counter()
+for _ in range(n_iters):
+    torch.nn.functional.linear(x, w_qkv)
+torch.cuda.synchronize()
+linear_qkv_time = (time.perf_counter() - start) / n_iters * 1000
+
+for _ in range(n_warmup):
+    torch.nn.functional.linear(x, w_out)
+torch.cuda.synchronize()
+
+start = time.perf_counter()
+for _ in range(n_iters):
+    torch.nn.functional.linear(x, w_out)
+torch.cuda.synchronize()
+linear_out_time = (time.perf_counter() - start) / n_iters * 1000
+
+# Attention (already measured above)
+q_bf16 = torch.randn(shape_121f, dtype=torch.bfloat16, device=device)
+k_bf16 = torch.randn(shape_121f, dtype=torch.bfloat16, device=device)
+v_bf16 = torch.randn(shape_121f, dtype=torch.bfloat16, device=device)
+
+for _ in range(n_warmup):
+    fa3_attn_func(q_bf16, k_bf16, v_bf16, softmax_scale=softmax_scale, causal=False)
+torch.cuda.synchronize()
+
+start = time.perf_counter()
+for _ in range(n_iters):
+    fa3_attn_func(q_bf16, k_bf16, v_bf16, softmax_scale=softmax_scale, causal=False)
+torch.cuda.synchronize()
+attn_time = (time.perf_counter() - start) / n_iters * 1000
+
+total = linear_qkv_time + attn_time + linear_out_time
+print(f"  121f single layer breakdown (estimated):")
+print(f"    QKV projection:  {linear_qkv_time:.2f} ms ({linear_qkv_time/total*100:.0f}%)")
+print(f"    Attention:       {attn_time:.2f} ms ({attn_time/total*100:.0f}%)")
+print(f"    Output proj:     {linear_out_time:.2f} ms ({linear_out_time/total*100:.0f}%)")
+print(f"    Total:           {total:.2f} ms")
+print(f"    Layers x steps:  54 layers x 30 steps = {54*30} calls")
+print(f"    Attn total est:  {attn_time * 54 * 30 / 1000:.1f}s out of ~{total * 54 * 30 / 1000:.1f}s")
+
+print("\n" + "=" * 60)
+print("Done.")
+print("=" * 60)

From 4ef08e9ca05c7ae57e2a77d82492a4fb4ad884c9 Mon Sep 17 00:00:00 2001
From: lishunyang <lishunyang12@163.com>
Date: Fri, 3 Apr 2026 01:44:39 +0800
Subject: [PATCH 13/45] Fix total_mem -> total_memory for PyTorch 2.10+

Signed-off-by: lishunyang <lishunyang12@163.com>
---
 debug_fa3_check.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/debug_fa3_check.py b/debug_fa3_check.py
index 00c88150277..09b25758b09 100644
--- a/debug_fa3_check.py
+++ b/debug_fa3_check.py
@@ -65,7 +65,8 @@
 if torch.cuda.is_available():
     gpu = torch.cuda.get_device_name(0)
     cap = torch.cuda.get_device_capability(0)
-    mem = torch.cuda.get_device_properties(0).total_mem / 1024**3
+    props = torch.cuda.get_device_properties(0)
+    mem = getattr(props, 'total_memory', getattr(props, 'total_mem', 0)) / 1024**3
     print(f"GPU: {gpu} (SM {cap[0]}{cap[1]}, {mem:.1f} GB)")
     print(f"  FP8 tensor cores: {'Yes' if cap[0] >= 9 else 'No'} (need SM90+)")
 else:

From 8f28879fda4daf0cd231e521082a509ec8bcfda6 Mon Sep 17 00:00:00 2001
From: lishunyang <lishunyang12@163.com>
Date: Fri, 3 Apr 2026 01:47:41 +0800
Subject: [PATCH 14/45] Fix descale shape: FA3 requires (batch, num_kv_heads)
 not scalar

Signed-off-by: lishunyang <lishunyang12@163.com>
---
 debug_fa3_check.py                            | 22 ++++++++++++++-----
 .../attention/backends/flash_attn.py          | 19 ++++++++++++----
 2 files changed, 32 insertions(+), 9 deletions(-)

diff --git a/debug_fa3_check.py b/debug_fa3_check.py
index 09b25758b09..e69367567c2 100644
--- a/debug_fa3_check.py
+++ b/debug_fa3_check.py
@@ -178,17 +178,21 @@
 
     # FP8 benchmark (quantize + attention)
     fp8_q, fp8_k, fp8_v, qs, ks, vs = quantize_qkv_fp8(q_bf16, k_bf16, v_bf16)
+    # FA3 expects descale shape (batch, num_kv_heads)
+    qs_2d = qs.view(1, 1).expand(B, H).contiguous()
+    ks_2d = ks.view(1, 1).expand(B, H).contiguous()
+    vs_2d = vs.view(1, 1).expand(B, H).contiguous()
 
     for _ in range(n_warmup):
         fa3_attn_func(fp8_q, fp8_k, fp8_v, softmax_scale=softmax_scale,
-                      causal=False, q_descale=qs, k_descale=ks, v_descale=vs)
+                      causal=False, q_descale=qs_2d, k_descale=ks_2d, v_descale=vs_2d)
     torch.cuda.synchronize()
 
     # FP8 kernel only (no quant overhead)
     start = time.perf_counter()
     for _ in range(n_iters):
         fa3_attn_func(fp8_q, fp8_k, fp8_v, softmax_scale=softmax_scale,
-                      causal=False, q_descale=qs, k_descale=ks, v_descale=vs)
+                      causal=False, q_descale=qs_2d, k_descale=ks_2d, v_descale=vs_2d)
     torch.cuda.synchronize()
     fp8_kernel_time = (time.perf_counter() - start) / n_iters * 1000
 
@@ -196,8 +200,11 @@
     start = time.perf_counter()
     for _ in range(n_iters):
         fp8_q, fp8_k, fp8_v, qs, ks, vs = quantize_qkv_fp8(q_bf16, k_bf16, v_bf16)
+        qs_2d = qs.view(1, 1).expand(B, H).contiguous()
+        ks_2d = ks.view(1, 1).expand(B, H).contiguous()
+        vs_2d = vs.view(1, 1).expand(B, H).contiguous()
         fa3_attn_func(fp8_q, fp8_k, fp8_v, softmax_scale=softmax_scale,
-                      causal=False, q_descale=qs, k_descale=ks, v_descale=vs)
+                      causal=False, q_descale=qs_2d, k_descale=ks_2d, v_descale=vs_2d)
     torch.cuda.synchronize()
     fp8_e2e_time = (time.perf_counter() - start) / n_iters * 1000
 
@@ -266,12 +273,17 @@
         fp8_q, fp8_k, fp8_v, mask, S, _unpad_input
     )
 
+    # FA3 expects descale shape (batch, num_kv_heads)
+    qs_2d = qs.view(1, 1).expand(B, H).contiguous()
+    ks_2d = ks.view(1, 1).expand(B, H).contiguous()
+    vs_2d = vs.view(1, 1).expand(B, H).contiguous()
+
     for _ in range(n_warmup):
         flash_attn_varlen_func(q_up_fp8, k_up_fp8, v_up_fp8,
                                cu_seqlens_q=cu_q, cu_seqlens_k=cu_k,
                                max_seqlen_q=max_q, max_seqlen_k=max_k,
                                softmax_scale=softmax_scale, causal=False,
-                               q_descale=qs, k_descale=ks, v_descale=vs)
+                               q_descale=qs_2d, k_descale=ks_2d, v_descale=vs_2d)
     torch.cuda.synchronize()
 
     start = time.perf_counter()
@@ -280,7 +292,7 @@
                                cu_seqlens_q=cu_q, cu_seqlens_k=cu_k,
                                max_seqlen_q=max_q, max_seqlen_k=max_k,
                                softmax_scale=softmax_scale, causal=False,
-                               q_descale=qs, k_descale=ks, v_descale=vs)
+                               q_descale=qs_2d, k_descale=ks_2d, v_descale=vs_2d)
     torch.cuda.synchronize()
     varlen_fp8 = (time.perf_counter() - start) / n_iters * 1000
 
diff --git a/vllm_omni/diffusion/attention/backends/flash_attn.py b/vllm_omni/diffusion/attention/backends/flash_attn.py
index 0d43f9dc6c2..d1899b03f37 100644
--- a/vllm_omni/diffusion/attention/backends/flash_attn.py
+++ b/vllm_omni/diffusion/attention/backends/flash_attn.py
@@ -222,6 +222,11 @@ def forward_npu(
         )
         return output
 
+    @staticmethod
+    def _reshape_descale(scale: torch.Tensor, batch: int, num_heads_k: int) -> torch.Tensor:
+        """Reshape per-tensor scale to FA3's expected (batch, num_heads_k) shape."""
+        return scale.view(1, 1).expand(batch, num_heads_k).contiguous()
+
     def _forward_fp8(
         self,
         query: torch.Tensor,
@@ -256,6 +261,12 @@ def _forward_fp8(
             attn_metadata.v_scale = None
             return self.forward_cuda(query, key, value, attn_metadata)
 
+        # Reshape per-tensor scales to FA3's expected (batch, num_kv_heads)
+        B, S, H, D = key.shape
+        q_descale = self._reshape_descale(q_scale, B, H)
+        k_descale = self._reshape_descale(k_scale, B, H)
+        v_descale = self._reshape_descale(v_scale, B, H)
+
         attention_mask = attn_metadata.attn_mask if attn_metadata is not None else None
         has_padding = attention_mask is not None and torch.any(~attention_mask)
 
@@ -263,7 +274,7 @@ def _forward_fp8(
             # FA3 varlen with FP8 descale
             return self._forward_varlen_masked(
                 query, key, value, attention_mask,
-                q_descale=q_scale, k_descale=k_scale, v_descale=v_scale,
+                q_descale=q_descale, k_descale=k_descale, v_descale=v_descale,
             )
 
         # FA3 regular path with FP8 descale
@@ -271,9 +282,9 @@ def _forward_fp8(
             query, key, value,
             softmax_scale=self.softmax_scale,
             causal=self.causal,
-            q_descale=q_scale,
-            k_descale=k_scale,
-            v_descale=v_scale,
+            q_descale=q_descale,
+            k_descale=k_descale,
+            v_descale=v_descale,
         )
         if isinstance(out, tuple):
             out = out[0]

From 6ca66eaa557cc3274c434c52611aceb472c2a208 Mon Sep 17 00:00:00 2001
From: lishunyang <lishunyang12@163.com>
Date: Fri, 3 Apr 2026 01:51:41 +0800
Subject: [PATCH 15/45] Add debug_shapes.py: log actual Q/K/V shapes during
 inference

Signed-off-by: lishunyang <lishunyang12@163.com>
---
 debug_shapes.py | 110 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 110 insertions(+)
 create mode 100644 debug_shapes.py

diff --git a/debug_shapes.py b/debug_shapes.py
new file mode 100644
index 00000000000..243fbf54d4b
--- /dev/null
+++ b/debug_shapes.py
@@ -0,0 +1,110 @@
+"""Debug script: check actual Q/K/V tensor shapes during HunyuanVideo inference.
+Patches Attention.forward to log shapes on first call, then runs a short generation."""
+import torch
+import sys
+
+# Monkey-patch Attention.forward to log actual tensor shapes
+_shape_log = []
+_logged_count = 0
+_max_log = 10  # only log first 10 unique shapes
+
+def _patch_attention():
+    from vllm_omni.diffusion.attention.layer import Attention
+    _orig_forward = Attention.forward
+
+    def _logging_forward(self, query, key, value, attn_metadata=None):
+        global _logged_count
+        if _logged_count < _max_log:
+            shape_key = (tuple(query.shape), tuple(key.shape))
+            if shape_key not in [s[0:2] for s in _shape_log]:
+                has_mask = (attn_metadata is not None and
+                            attn_metadata.attn_mask is not None)
+                mask_shape = (list(attn_metadata.attn_mask.shape)
+                              if has_mask else None)
+                mask_false = 0
+                if has_mask:
+                    mask_false = int((~attn_metadata.attn_mask).sum().item())
+
+                entry = (
+                    tuple(query.shape),
+                    tuple(key.shape),
+                    query.dtype,
+                    mask_shape,
+                    mask_false,
+                )
+                _shape_log.append(entry)
+                _logged_count += 1
+                B, S, H, D = query.shape
+                print(f"[SHAPE] q={list(query.shape)} k={list(key.shape)} "
+                      f"dtype={query.dtype} "
+                      f"tokens={S} heads={H} headdim={D} "
+                      f"mask={mask_shape} mask_false={mask_false}")
+        return _orig_forward(self, query, key, value, attn_metadata)
+
+    Attention.forward = _logging_forward
+    print("[DEBUG] Attention.forward patched for shape logging")
+
+_patch_attention()
+
+# Also estimate theoretical attention fraction
+print("\n" + "=" * 60)
+print("Theoretical token counts for HunyuanVideo 1.5")
+print("=" * 60)
+for n_frames in [33, 61, 81, 121]:
+    # HunyuanVideo 1.5 VAE: 4x temporal, 8x spatial
+    t = (n_frames - 1) // 4 + 1
+    h = 480 // 8
+    w = 832 // 8
+    vae_tokens = t * h * w
+    # Patchify: 1x2x2 for HunyuanVideo 1.5
+    h_p = h // 2
+    w_p = w // 2
+    patch_tokens = t * h_p * w_p
+    print(f"  {n_frames}f: VAE latent {t}x{h}x{w}={vae_tokens}, "
+          f"after patch {t}x{h_p}x{w_p}={patch_tokens} tokens")
+
+# Run a short generation to capture actual shapes
+print("\n" + "=" * 60)
+print("Running short generation to capture actual shapes...")
+print("=" * 60)
+
+from vllm_omni.diffusion.data import DiffusionParallelConfig
+from vllm_omni.entrypoints.omni import Omni
+from vllm_omni.inputs.data import OmniDiffusionSamplingParams
+from vllm_omni.platforms import current_omni_platform
+
+model = "hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-480p_t2v"
+generator = torch.Generator(device=current_omni_platform.device_type).manual_seed(42)
+
+omni = Omni(
+    model=model,
+    vae_use_tiling=True,
+    enforce_eager=True,  # disable torch.compile for cleaner shape logging
+    parallel_config=DiffusionParallelConfig(),
+)
+
+print("\n[DEBUG] Starting generation (2 steps only)...")
+try:
+    outputs = omni.generate(
+        {"prompt": "A cat in a garden."},
+        OmniDiffusionSamplingParams(
+            height=480,
+            width=832,
+            num_frames=121,
+            generator=generator,
+            guidance_scale=6.0,
+            num_inference_steps=2,  # just 2 steps to see shapes
+        ),
+    )
+    print("[DEBUG] Generation completed.")
+except Exception as e:
+    print(f"[DEBUG] Generation error (expected with 2 steps): {e}")
+
+print("\n" + "=" * 60)
+print("Shape summary")
+print("=" * 60)
+for entry in _shape_log:
+    q_shape, k_shape, dtype, mask_shape, mask_false = entry
+    B, S, H, D = q_shape
+    print(f"  q={list(q_shape)} k={list(k_shape)} "
+          f"tokens={S} mask_false={mask_false}")

From d6c03460f9977ba54d771ab6d4479540c86dd37c Mon Sep 17 00:00:00 2001
From: lishunyang <lishunyang12@163.com>
Date: Fri, 3 Apr 2026 01:55:45 +0800
Subject: [PATCH 16/45] Fix debug_shapes.py: add __main__ guard for
 multiprocessing spawn

Signed-off-by: lishunyang <lishunyang12@163.com>
---
 debug_shapes.py | 127 +++++++++++++++++++++++++-----------------------
 1 file changed, 65 insertions(+), 62 deletions(-)

diff --git a/debug_shapes.py b/debug_shapes.py
index 243fbf54d4b..5a5edd6106c 100644
--- a/debug_shapes.py
+++ b/debug_shapes.py
@@ -1,13 +1,13 @@
 """Debug script: check actual Q/K/V tensor shapes during HunyuanVideo inference.
 Patches Attention.forward to log shapes on first call, then runs a short generation."""
 import torch
-import sys
 
 # Monkey-patch Attention.forward to log actual tensor shapes
 _shape_log = []
 _logged_count = 0
 _max_log = 10  # only log first 10 unique shapes
 
+
 def _patch_attention():
     from vllm_omni.diffusion.attention.layer import Attention
     _orig_forward = Attention.forward
@@ -44,67 +44,70 @@ def _logging_forward(self, query, key, value, attn_metadata=None):
     Attention.forward = _logging_forward
     print("[DEBUG] Attention.forward patched for shape logging")
 
+
 _patch_attention()
 
-# Also estimate theoretical attention fraction
-print("\n" + "=" * 60)
-print("Theoretical token counts for HunyuanVideo 1.5")
-print("=" * 60)
-for n_frames in [33, 61, 81, 121]:
-    # HunyuanVideo 1.5 VAE: 4x temporal, 8x spatial
-    t = (n_frames - 1) // 4 + 1
-    h = 480 // 8
-    w = 832 // 8
-    vae_tokens = t * h * w
-    # Patchify: 1x2x2 for HunyuanVideo 1.5
-    h_p = h // 2
-    w_p = w // 2
-    patch_tokens = t * h_p * w_p
-    print(f"  {n_frames}f: VAE latent {t}x{h}x{w}={vae_tokens}, "
-          f"after patch {t}x{h_p}x{w_p}={patch_tokens} tokens")
-
-# Run a short generation to capture actual shapes
-print("\n" + "=" * 60)
-print("Running short generation to capture actual shapes...")
-print("=" * 60)
-
-from vllm_omni.diffusion.data import DiffusionParallelConfig
-from vllm_omni.entrypoints.omni import Omni
-from vllm_omni.inputs.data import OmniDiffusionSamplingParams
-from vllm_omni.platforms import current_omni_platform
-
-model = "hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-480p_t2v"
-generator = torch.Generator(device=current_omni_platform.device_type).manual_seed(42)
-
-omni = Omni(
-    model=model,
-    vae_use_tiling=True,
-    enforce_eager=True,  # disable torch.compile for cleaner shape logging
-    parallel_config=DiffusionParallelConfig(),
-)
-
-print("\n[DEBUG] Starting generation (2 steps only)...")
-try:
-    outputs = omni.generate(
-        {"prompt": "A cat in a garden."},
-        OmniDiffusionSamplingParams(
-            height=480,
-            width=832,
-            num_frames=121,
-            generator=generator,
-            guidance_scale=6.0,
-            num_inference_steps=2,  # just 2 steps to see shapes
-        ),
+
+def main():
+    print("\n" + "=" * 60)
+    print("Theoretical token counts for HunyuanVideo 1.5")
+    print("=" * 60)
+    for n_frames in [33, 61, 81, 121]:
+        t = (n_frames - 1) // 4 + 1
+        h = 480 // 8
+        w = 832 // 8
+        vae_tokens = t * h * w
+        h_p = h // 2
+        w_p = w // 2
+        patch_tokens = t * h_p * w_p
+        print(f"  {n_frames}f: VAE latent {t}x{h}x{w}={vae_tokens}, "
+              f"after patch {t}x{h_p}x{w_p}={patch_tokens} tokens")
+
+    print("\n" + "=" * 60)
+    print("Running short generation to capture actual shapes...")
+    print("=" * 60)
+
+    from vllm_omni.diffusion.data import DiffusionParallelConfig
+    from vllm_omni.entrypoints.omni import Omni
+    from vllm_omni.inputs.data import OmniDiffusionSamplingParams
+    from vllm_omni.platforms import current_omni_platform
+
+    model = "hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-480p_t2v"
+    generator = torch.Generator(device=current_omni_platform.device_type).manual_seed(42)
+
+    omni = Omni(
+        model=model,
+        vae_use_tiling=True,
+        enforce_eager=True,
+        parallel_config=DiffusionParallelConfig(),
     )
-    print("[DEBUG] Generation completed.")
-except Exception as e:
-    print(f"[DEBUG] Generation error (expected with 2 steps): {e}")
-
-print("\n" + "=" * 60)
-print("Shape summary")
-print("=" * 60)
-for entry in _shape_log:
-    q_shape, k_shape, dtype, mask_shape, mask_false = entry
-    B, S, H, D = q_shape
-    print(f"  q={list(q_shape)} k={list(k_shape)} "
-          f"tokens={S} mask_false={mask_false}")
+
+    print("\n[DEBUG] Starting generation (2 steps only)...")
+    try:
+        outputs = omni.generate(
+            {"prompt": "A cat in a garden."},
+            OmniDiffusionSamplingParams(
+                height=480,
+                width=832,
+                num_frames=121,
+                generator=generator,
+                guidance_scale=6.0,
+                num_inference_steps=2,
+            ),
+        )
+        print("[DEBUG] Generation completed.")
+    except Exception as e:
+        print(f"[DEBUG] Generation error (may be expected): {e}")
+
+    print("\n" + "=" * 60)
+    print("Shape summary")
+    print("=" * 60)
+    for entry in _shape_log:
+        q_shape, k_shape, dtype, mask_shape, mask_false = entry
+        B, S, H, D = q_shape
+        print(f"  q={list(q_shape)} k={list(k_shape)} "
+              f"tokens={S} mask_false={mask_false}")
+
+
+if __name__ == "__main__":
+    main()

From 6ef694b2635873b46fc7c80d435d15c3670fe3a0 Mon Sep 17 00:00:00 2001
From: lishunyang <lishunyang12@163.com>
Date: Fri, 3 Apr 2026 02:28:54 +0800
Subject: [PATCH 17/45] Update benchmarks with actual HunyuanVideo shapes:
 50345 tokens, 16 heads

Signed-off-by: lishunyang <lishunyang12@163.com>
---
 debug_fa3_check.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/debug_fa3_check.py b/debug_fa3_check.py
index e69367567c2..822bd0db36b 100644
--- a/debug_fa3_check.py
+++ b/debug_fa3_check.py
@@ -102,12 +102,12 @@
 
 # Simulate HunyuanVideo tensor shapes
 # 33 frames: ~(1, 2640, 24, 128) for single-stream, (1, 2640+256, 24, 128) for joint
-# 121 frames: ~(1, 9680, 24, 128) for single-stream
+# Actual shapes from debug_shapes.py:
+# 121f: q=[1, 50345, 16, 128] (48360 img + ~1985 encoder tokens)
+# dummy: q=[1, 6081, 16, 128] (smaller warmup shape)
 test_shapes = [
-    ("33f single-stream", (1, 2640, 24, 128)),
-    ("33f joint (img+txt)", (1, 2896, 24, 128)),
-    ("121f single-stream", (1, 9680, 24, 128)),
-    ("121f joint (img+txt)", (1, 9936, 24, 128)),
+    ("dummy warmup", (1, 6081, 16, 128)),
+    ("121f actual", (1, 50345, 16, 128)),
 ]
 
 for name, shape in test_shapes:
@@ -150,8 +150,8 @@
     exit()
 
 bench_shapes = [
-    ("33f", (1, 2640, 24, 128)),
-    ("121f", (1, 9680, 24, 128)),
+    ("dummy", (1, 6081, 16, 128)),
+    ("121f actual", (1, 50345, 16, 128)),
 ]
 
 n_warmup = 5
@@ -242,9 +242,9 @@
     k_bf16 = torch.randn(shape, dtype=torch.bfloat16, device=device)
     v_bf16 = torch.randn(shape, dtype=torch.bfloat16, device=device)
 
-    # Create a realistic mask: mostly True, some False at end (encoder padding)
+    # Create a realistic mask: ~1974 false values (actual encoder padding)
     mask = torch.ones(B, S, dtype=torch.bool, device=device)
-    n_pad = max(1, S // 20)  # 5% padding
+    n_pad = 1974  # actual mask_false from debug_shapes.py
     mask[:, -n_pad:] = False
 
     # Unpad inputs
@@ -298,7 +298,7 @@
 
     speedup = varlen_bf16 / varlen_fp8
 
-    print(f"\n  {name} {list(shape)} (5% padding):")
+    print(f"\n  {name} {list(shape)} ({n_pad} padding):")
     print(f"    BF16 varlen:  {varlen_bf16:.2f} ms")
     print(f"    FP8 varlen:   {varlen_fp8:.2f} ms  ({speedup:.2f}x)")
 
@@ -309,7 +309,7 @@
 print("6. Time breakdown estimate for one DiT layer")
 print("=" * 60)
 
-shape_121f = (1, 9680, 24, 128)
+shape_121f = (1, 50345, 16, 128)
 B, S, H, D = shape_121f
 hidden_dim = H * D  # 3072
 softmax_scale = D ** -0.5

From 777355171b01a562a8dc220d2a650302ba2ca596 Mon Sep 17 00:00:00 2001
From: lishunyang <lishunyang12@163.com>
Date: Fri, 3 Apr 2026 02:39:53 +0800
Subject: [PATCH 18/45] Implement delayed scaling: cache scales to skip amax on
 subsequent calls

First call per layer computes dynamic scales (amax reduction). All
subsequent calls reuse cached scales (static mode, just scale+cast).
This eliminates the ~4ms amax overhead for ~53 of 54 layers per step.

Signed-off-by: lishunyang <lishunyang12@163.com>
---
 debug_fa3_check.py                     | 17 +++++-
 vllm_omni/diffusion/attention/layer.py | 19 +++++--
 vllm_omni/quantization/kv_quant.py     | 72 +++++++++++++++++---------
 3 files changed, 80 insertions(+), 28 deletions(-)

diff --git a/debug_fa3_check.py b/debug_fa3_check.py
index 822bd0db36b..b173735d943 100644
--- a/debug_fa3_check.py
+++ b/debug_fa3_check.py
@@ -129,7 +129,22 @@
     torch.cuda.synchronize()
     quant_time = (time.perf_counter() - start) / n_iters * 1000
 
-    print(f"  {name} {list(shape)}: quant={quant_time:.2f} ms")
+    # Static quant (with cached scale — no amax)
+    _, _, _, qs, ks, vs = quantize_qkv_fp8(q, k, v)
+    for _ in range(3):
+        quantize_qkv_fp8(q, k, v, cached_scales=(qs, ks, vs))
+    torch.cuda.synchronize()
+
+    start = time.perf_counter()
+    for _ in range(n_iters):
+        fp8_q, fp8_k, fp8_v, qs2, ks2, vs2 = quantize_qkv_fp8(
+            q, k, v, cached_scales=(qs, ks, vs)
+        )
+    torch.cuda.synchronize()
+    static_time = (time.perf_counter() - start) / n_iters * 1000
+
+    print(f"  {name} {list(shape)}: dynamic={quant_time:.2f} ms, "
+          f"static={static_time:.2f} ms ({quant_time/static_time:.1f}x faster)")
 
 # ============================================================
 # 4. FA3 attention kernel benchmark (BF16 vs FP8)
diff --git a/vllm_omni/diffusion/attention/layer.py b/vllm_omni/diffusion/attention/layer.py
index 28cde6e06c1..d2afcf1b68f 100644
--- a/vllm_omni/diffusion/attention/layer.py
+++ b/vllm_omni/diffusion/attention/layer.py
@@ -94,6 +94,9 @@ def __init__(
         # FP8 attention quantization: resolved lazily in forward() because
         # forward_context is not available during model loading.
         self._fp8_attn_enabled: bool | None = None
+        # Cached scales for delayed scaling (reuse previous timestep's scales)
+        self._cached_qkv_scales: tuple | None = None
+        self._cached_jkv_scales: tuple | None = None
 
     def _get_active_parallel_strategy(self):
         """Get the parallel strategy based on current SP active state.
@@ -115,15 +118,23 @@ def _quantize_qkv_fp8(
         value: torch.Tensor,
         attn_metadata: AttentionMetadata | None,
     ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, AttentionMetadata | None]:
-        """Quantize Q/K/V tensors to FP8 and store scales in attn_metadata."""
+        """Quantize Q/K/V tensors to FP8 and store scales in attn_metadata.
+
+        Uses delayed scaling: first call computes dynamic scales (amax),
+        subsequent calls reuse the cached scales (static, no amax).
+        Scales are refreshed each timestep since the first layer in each
+        step always runs dynamic.
+        """
         from vllm_omni.quantization.kv_quant import (
             quantize_kv_fp8,
             quantize_qkv_fp8,
         )
 
         fp8_q, fp8_k, fp8_v, q_scale, k_scale, v_scale = quantize_qkv_fp8(
-            query, key, value
+            query, key, value, cached_scales=self._cached_qkv_scales
         )
+        # Cache scales for next call (delayed scaling)
+        self._cached_qkv_scales = (q_scale, k_scale, v_scale)
 
         if attn_metadata is None:
             attn_metadata = AttentionMetadata()
@@ -134,12 +145,14 @@ def _quantize_qkv_fp8(
         # Quantize joint_key/joint_value with separate scales
         if attn_metadata.joint_key is not None and attn_metadata.joint_value is not None:
             jk, jv, jk_scale, jv_scale = quantize_kv_fp8(
-                attn_metadata.joint_key, attn_metadata.joint_value
+                attn_metadata.joint_key, attn_metadata.joint_value,
+                cached_scales=self._cached_jkv_scales,
             )
             attn_metadata.joint_key = jk
             attn_metadata.joint_value = jv
             attn_metadata.jk_scale = jk_scale
             attn_metadata.jv_scale = jv_scale
+            self._cached_jkv_scales = (jk_scale, jv_scale)
 
         return fp8_q, fp8_k, fp8_v, attn_metadata
 
diff --git a/vllm_omni/quantization/kv_quant.py b/vllm_omni/quantization/kv_quant.py
index 6a621f6fdc5..94992c1f11d 100644
--- a/vllm_omni/quantization/kv_quant.py
+++ b/vllm_omni/quantization/kv_quant.py
@@ -6,8 +6,10 @@
 float8_e4m3fn format. Designed for diffusion models where Q/K/V are
 computed fresh each forward pass (no persistent KV cache).
 
-Uses vLLM's fused CUDA kernel (scaled_fp8_quant) for efficient
-amax+scale+cast in a single kernel launch.
+Supports two modes:
+  - Dynamic: computes amax per call (accurate but ~4ms overhead at 50K tokens)
+  - Static (delayed scaling): reuses a cached scale from the previous call,
+    skipping the expensive amax reduction (~0.5ms overhead).
 """
 
 import torch
@@ -30,40 +32,51 @@
 
 def _quantize_tensor_fp8(
     tensor: torch.Tensor,
+    cached_scale: torch.Tensor | None = None,
 ) -> tuple[torch.Tensor, torch.Tensor]:
-    """Quantize a single tensor to FP8 with per-tensor dynamic scaling.
+    """Quantize a single tensor to FP8 with per-tensor scaling.
 
-    Uses vLLM's fused CUDA kernel when available (single kernel launch
-    for amax reduction + scale computation + FP8 cast). Falls back to
-    3 separate PyTorch ops otherwise.
+    Args:
+        tensor: Input tensor in BF16/FP16.
+        cached_scale: If provided, use this scale (static mode, skips amax).
+            If None, compute scale dynamically.
 
     Returns:
         ``(fp8_tensor, inv_scale)`` where inv_scale is the dequant scale.
     """
     if _HAS_FUSED_QUANT:
-        # scaled_fp8_quant requires 2D input [M, N]
         orig_shape = tensor.shape
         flat = tensor.reshape(-1, orig_shape[-1])
-        # Dynamic per-tensor quantization: scale=None
-        fp8_flat, scale = _vllm_scaled_fp8_quant(flat)
+        # Pass cached_scale for static quant (no amax), None for dynamic
+        fp8_flat, scale = _vllm_scaled_fp8_quant(flat, scale=cached_scale)
         fp8_out = fp8_flat.reshape(orig_shape)
-        # scale from vLLM is 1/scale (inv_scale / dequant scale)
         return fp8_out, scale
     else:
         finfo = torch.finfo(torch.float8_e4m3fn)
-        amax = tensor.abs().amax().clamp(min=1e-12)
-        scale_factor = finfo.max / amax
-        fp8 = (tensor * scale_factor).clamp(finfo.min, finfo.max).to(
-            torch.float8_e4m3fn
-        )
-        inv_scale = amax / finfo.max
-        return fp8, inv_scale
+        if cached_scale is not None:
+            # Static: use cached scale directly
+            inv_scale = cached_scale
+            scale_factor = 1.0 / inv_scale
+            fp8 = (tensor * scale_factor).clamp(finfo.min, finfo.max).to(
+                torch.float8_e4m3fn
+            )
+            return fp8, inv_scale
+        else:
+            # Dynamic: compute amax
+            amax = tensor.abs().amax().clamp(min=1e-12)
+            scale_factor = finfo.max / amax
+            fp8 = (tensor * scale_factor).clamp(finfo.min, finfo.max).to(
+                torch.float8_e4m3fn
+            )
+            inv_scale = amax / finfo.max
+            return fp8, inv_scale
 
 
 def quantize_qkv_fp8(
     query: torch.Tensor,
     key: torch.Tensor,
     value: torch.Tensor,
+    cached_scales: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
 ) -> tuple[
     torch.Tensor,
     torch.Tensor,
@@ -72,35 +85,46 @@ def quantize_qkv_fp8(
     torch.Tensor,
     torch.Tensor,
 ]:
-    """Quantize Q/K/V tensors to float8_e4m3fn with dynamic per-tensor scaling.
+    """Quantize Q/K/V tensors to float8_e4m3fn.
 
     Args:
         query: Query tensor in BF16/FP16, shape ``(B, S, H, D)``
         key: Key tensor in BF16/FP16, shape ``(B, S, H, D)``
         value: Value tensor in BF16/FP16, shape ``(B, S, H, D)``
+        cached_scales: Optional ``(q_scale, k_scale, v_scale)`` from a
+            previous call. When provided, skips the expensive amax
+            reduction (static/delayed scaling mode).
 
     Returns:
         ``(fp8_query, fp8_key, fp8_value, q_scale, k_scale, v_scale)``
         where scales are inverse (dequant) scales.
-        Pass as ``descale_q/k/v`` to FA3 or use :func:`dequantize_fp8`.
     """
-    fp8_q, q_scale = _quantize_tensor_fp8(query)
-    fp8_k, k_scale = _quantize_tensor_fp8(key)
-    fp8_v, v_scale = _quantize_tensor_fp8(value)
+    if cached_scales is not None:
+        cq, ck, cv = cached_scales
+    else:
+        cq = ck = cv = None
+    fp8_q, q_scale = _quantize_tensor_fp8(query, cq)
+    fp8_k, k_scale = _quantize_tensor_fp8(key, ck)
+    fp8_v, v_scale = _quantize_tensor_fp8(value, cv)
     return fp8_q, fp8_k, fp8_v, q_scale, k_scale, v_scale
 
 
 def quantize_kv_fp8(
     key: torch.Tensor,
     value: torch.Tensor,
+    cached_scales: tuple[torch.Tensor, torch.Tensor] | None = None,
 ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
     """Quantize K/V tensors to float8_e4m3fn (joint attention path).
 
     Returns:
         ``(fp8_key, fp8_value, k_scale, v_scale)``
     """
-    fp8_k, k_scale = _quantize_tensor_fp8(key)
-    fp8_v, v_scale = _quantize_tensor_fp8(value)
+    if cached_scales is not None:
+        ck, cv = cached_scales
+    else:
+        ck = cv = None
+    fp8_k, k_scale = _quantize_tensor_fp8(key, ck)
+    fp8_v, v_scale = _quantize_tensor_fp8(value, cv)
     return fp8_k, fp8_v, k_scale, v_scale
 
 

From 99bb651a5063d4d02bcf2195ee57aea53697d84c Mon Sep 17 00:00:00 2001
From: lishunyang <lishunyang12@163.com>
Date: Fri, 3 Apr 2026 03:01:10 +0800
Subject: [PATCH 19/45] Disable delayed scaling (green output bug), use dynamic
 only

Delayed scaling gives 17% speedup but produces corrupted (green)
output. Stale scales cause overflow/underflow in FP8. Disable until
proper scale refresh strategy is implemented.

Dynamic-only FP8 with correct descale params should give ~10% speedup.

Signed-off-by: lishunyang <lishunyang12@163.com>
---
 vllm_omni/diffusion/attention/layer.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/vllm_omni/diffusion/attention/layer.py b/vllm_omni/diffusion/attention/layer.py
index d2afcf1b68f..cc2159a617f 100644
--- a/vllm_omni/diffusion/attention/layer.py
+++ b/vllm_omni/diffusion/attention/layer.py
@@ -130,11 +130,11 @@ def _quantize_qkv_fp8(
             quantize_qkv_fp8,
         )
 
+        # TODO: delayed scaling (cached_scales) gives 17% speedup but causes
+        # green output. Disabled pending investigation. Use dynamic only.
         fp8_q, fp8_k, fp8_v, q_scale, k_scale, v_scale = quantize_qkv_fp8(
-            query, key, value, cached_scales=self._cached_qkv_scales
+            query, key, value, cached_scales=None
         )
-        # Cache scales for next call (delayed scaling)
-        self._cached_qkv_scales = (q_scale, k_scale, v_scale)
 
         if attn_metadata is None:
             attn_metadata = AttentionMetadata()
@@ -146,7 +146,7 @@ def _quantize_qkv_fp8(
         if attn_metadata.joint_key is not None and attn_metadata.joint_value is not None:
             jk, jv, jk_scale, jv_scale = quantize_kv_fp8(
                 attn_metadata.joint_key, attn_metadata.joint_value,
-                cached_scales=self._cached_jkv_scales,
+                cached_scales=None,
             )
             attn_metadata.joint_key = jk
             attn_metadata.joint_value = jv

From 0a2b3e9d544abb2f977f10c4a940c8cd26402b9a Mon Sep 17 00:00:00 2001
From: lishunyang <lishunyang12@163.com>
Date: Fri, 3 Apr 2026 03:09:38 +0800
Subject: [PATCH 20/45] Test KV-only FP8: keep Q in BF16, only quantize K/V

Q precision is critical for softmax accuracy. FP8 Q may cause output
corruption due to Hopper FP8 TC accumulation imprecision (FA3 #2250).
This test keeps Q in BF16 while K/V use FP8 with FA3 descale.

Signed-off-by: lishunyang <lishunyang12@163.com>
---
 .../diffusion/attention/backends/flash_attn.py     |  5 +++--
 vllm_omni/diffusion/attention/backends/sdpa.py     |  3 ++-
 vllm_omni/diffusion/attention/layer.py             | 14 +++++++++-----
 3 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/vllm_omni/diffusion/attention/backends/flash_attn.py b/vllm_omni/diffusion/attention/backends/flash_attn.py
index d1899b03f37..216d7ae5182 100644
--- a/vllm_omni/diffusion/attention/backends/flash_attn.py
+++ b/vllm_omni/diffusion/attention/backends/flash_attn.py
@@ -253,7 +253,8 @@ def _forward_fp8(
                 "Install FA3 for optimal FP8 support on Hopper GPUs."
             )
             output_dtype = torch.bfloat16
-            query = dequantize_fp8(query, q_scale, output_dtype)
+            if q_scale is not None:
+                query = dequantize_fp8(query, q_scale, output_dtype)
             key = dequantize_fp8(key, k_scale, output_dtype)
             value = dequantize_fp8(value, v_scale, output_dtype)
             attn_metadata.q_scale = None
@@ -263,7 +264,7 @@ def _forward_fp8(
 
         # Reshape per-tensor scales to FA3's expected (batch, num_kv_heads)
         B, S, H, D = key.shape
-        q_descale = self._reshape_descale(q_scale, B, H)
+        q_descale = self._reshape_descale(q_scale, B, H) if q_scale is not None else None
         k_descale = self._reshape_descale(k_scale, B, H)
         v_descale = self._reshape_descale(v_scale, B, H)
 
diff --git a/vllm_omni/diffusion/attention/backends/sdpa.py b/vllm_omni/diffusion/attention/backends/sdpa.py
index f400f57dba8..3eb81a5c023 100644
--- a/vllm_omni/diffusion/attention/backends/sdpa.py
+++ b/vllm_omni/diffusion/attention/backends/sdpa.py
@@ -105,7 +105,8 @@ def _forward_impl(
             q_scale = attn_metadata.q_scale if attn_metadata else None
             k_scale = attn_metadata.k_scale if attn_metadata else None
             v_scale = attn_metadata.v_scale if attn_metadata else None
-            query = dequantize_fp8(query, q_scale, output_dtype)
+            if q_scale is not None:
+                query = dequantize_fp8(query, q_scale, output_dtype)
             key = dequantize_fp8(key, k_scale, output_dtype)
             value = dequantize_fp8(value, v_scale, output_dtype)
             logger.warning_once(
diff --git a/vllm_omni/diffusion/attention/layer.py b/vllm_omni/diffusion/attention/layer.py
index cc2159a617f..f7a593cfc3e 100644
--- a/vllm_omni/diffusion/attention/layer.py
+++ b/vllm_omni/diffusion/attention/layer.py
@@ -130,18 +130,22 @@ def _quantize_qkv_fp8(
             quantize_qkv_fp8,
         )
 
-        # TODO: delayed scaling (cached_scales) gives 17% speedup but causes
-        # green output. Disabled pending investigation. Use dynamic only.
-        fp8_q, fp8_k, fp8_v, q_scale, k_scale, v_scale = quantize_qkv_fp8(
-            query, key, value, cached_scales=None
+        # Only quantize K/V to FP8, keep Q in BF16.
+        # Q precision is critical for softmax accuracy — FP8 Q causes
+        # green output due to Hopper FP8 TC accumulation imprecision.
+        fp8_k, fp8_v, k_scale, v_scale = quantize_kv_fp8(
+            key, value, cached_scales=None
         )
 
         if attn_metadata is None:
             attn_metadata = AttentionMetadata()
-        attn_metadata.q_scale = q_scale
+        attn_metadata.q_scale = None
         attn_metadata.k_scale = k_scale
         attn_metadata.v_scale = v_scale
 
+        # Q stays BF16, K/V are FP8
+        fp8_q = query
+
         # Quantize joint_key/joint_value with separate scales
         if attn_metadata.joint_key is not None and attn_metadata.joint_value is not None:
             jk, jv, jk_scale, jv_scale = quantize_kv_fp8(

From 55256a10c28664ff922bb162b5910bfd077e616b Mon Sep 17 00:00:00 2001
From: lishunyang <lishunyang12@163.com>
Date: Fri, 3 Apr 2026 03:20:15 +0800
Subject: [PATCH 21/45] Skip varlen for FP8: use regular FA3 path to avoid
 varlen descale bug
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

FA3 varlen + FP8 descale produces corrupted output on current fa3-fwd
builds. Use regular fa3_attn_func even when padding is present — the
padding tokens (~4%) add negligible compute overhead.

Signed-off-by: lishunyang <lishunyang12@163.com>
---
 .../diffusion/attention/backends/flash_attn.py     | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/vllm_omni/diffusion/attention/backends/flash_attn.py b/vllm_omni/diffusion/attention/backends/flash_attn.py
index 216d7ae5182..fae520115c0 100644
--- a/vllm_omni/diffusion/attention/backends/flash_attn.py
+++ b/vllm_omni/diffusion/attention/backends/flash_attn.py
@@ -268,17 +268,9 @@ def _forward_fp8(
         k_descale = self._reshape_descale(k_scale, B, H)
         v_descale = self._reshape_descale(v_scale, B, H)
 
-        attention_mask = attn_metadata.attn_mask if attn_metadata is not None else None
-        has_padding = attention_mask is not None and torch.any(~attention_mask)
-
-        if has_padding:
-            # FA3 varlen with FP8 descale
-            return self._forward_varlen_masked(
-                query, key, value, attention_mask,
-                q_descale=q_descale, k_descale=k_descale, v_descale=v_descale,
-            )
-
-        # FA3 regular path with FP8 descale
+        # Use regular FA3 path even with padding — FA3 varlen + FP8 descale
+        # produces corrupted output on current fa3-fwd builds. The padding
+        # tokens (~4% for HunyuanVideo) add negligible extra compute.
         out = fa3_attn_func(
             query, key, value,
             softmax_scale=self.softmax_scale,

From 8fa2ffac2a3b83ceb8844bfdebd700ef01efa622 Mon Sep 17 00:00:00 2001
From: lishunyang <lishunyang12@163.com>
Date: Fri, 3 Apr 2026 03:25:20 +0800
Subject: [PATCH 22/45] Revert to QKV FP8: FA3 requires same dtype for Q/K/V

KV-only with BF16 Q fails: 'query and key must have the same dtype'.
Revert to quantizing all QKV. Test with --enforce-eager to rule out
torch.compile as the cause of green output.

Signed-off-by: lishunyang <lishunyang12@163.com>
---
 vllm_omni/diffusion/attention/backends/flash_attn.py |  5 ++---
 vllm_omni/diffusion/attention/backends/sdpa.py       |  3 +--
 vllm_omni/diffusion/attention/layer.py               | 12 +++---------
 3 files changed, 6 insertions(+), 14 deletions(-)

diff --git a/vllm_omni/diffusion/attention/backends/flash_attn.py b/vllm_omni/diffusion/attention/backends/flash_attn.py
index fae520115c0..4131bb6e3a3 100644
--- a/vllm_omni/diffusion/attention/backends/flash_attn.py
+++ b/vllm_omni/diffusion/attention/backends/flash_attn.py
@@ -253,8 +253,7 @@ def _forward_fp8(
                 "Install FA3 for optimal FP8 support on Hopper GPUs."
             )
             output_dtype = torch.bfloat16
-            if q_scale is not None:
-                query = dequantize_fp8(query, q_scale, output_dtype)
+            query = dequantize_fp8(query, q_scale, output_dtype)
             key = dequantize_fp8(key, k_scale, output_dtype)
             value = dequantize_fp8(value, v_scale, output_dtype)
             attn_metadata.q_scale = None
@@ -264,7 +263,7 @@ def _forward_fp8(
 
         # Reshape per-tensor scales to FA3's expected (batch, num_kv_heads)
         B, S, H, D = key.shape
-        q_descale = self._reshape_descale(q_scale, B, H) if q_scale is not None else None
+        q_descale = self._reshape_descale(q_scale, B, H)
         k_descale = self._reshape_descale(k_scale, B, H)
         v_descale = self._reshape_descale(v_scale, B, H)
 
diff --git a/vllm_omni/diffusion/attention/backends/sdpa.py b/vllm_omni/diffusion/attention/backends/sdpa.py
index 3eb81a5c023..f400f57dba8 100644
--- a/vllm_omni/diffusion/attention/backends/sdpa.py
+++ b/vllm_omni/diffusion/attention/backends/sdpa.py
@@ -105,8 +105,7 @@ def _forward_impl(
             q_scale = attn_metadata.q_scale if attn_metadata else None
             k_scale = attn_metadata.k_scale if attn_metadata else None
             v_scale = attn_metadata.v_scale if attn_metadata else None
-            if q_scale is not None:
-                query = dequantize_fp8(query, q_scale, output_dtype)
+            query = dequantize_fp8(query, q_scale, output_dtype)
             key = dequantize_fp8(key, k_scale, output_dtype)
             value = dequantize_fp8(value, v_scale, output_dtype)
             logger.warning_once(
diff --git a/vllm_omni/diffusion/attention/layer.py b/vllm_omni/diffusion/attention/layer.py
index f7a593cfc3e..846650d3f78 100644
--- a/vllm_omni/diffusion/attention/layer.py
+++ b/vllm_omni/diffusion/attention/layer.py
@@ -130,22 +130,16 @@ def _quantize_qkv_fp8(
             quantize_qkv_fp8,
         )
 
-        # Only quantize K/V to FP8, keep Q in BF16.
-        # Q precision is critical for softmax accuracy — FP8 Q causes
-        # green output due to Hopper FP8 TC accumulation imprecision.
-        fp8_k, fp8_v, k_scale, v_scale = quantize_kv_fp8(
-            key, value, cached_scales=None
+        fp8_q, fp8_k, fp8_v, q_scale, k_scale, v_scale = quantize_qkv_fp8(
+            query, key, value, cached_scales=None
         )
 
         if attn_metadata is None:
             attn_metadata = AttentionMetadata()
-        attn_metadata.q_scale = None
+        attn_metadata.q_scale = q_scale
         attn_metadata.k_scale = k_scale
         attn_metadata.v_scale = v_scale
 
-        # Q stays BF16, K/V are FP8
-        fp8_q = query
-
         # Quantize joint_key/joint_value with separate scales
         if attn_metadata.joint_key is not None and attn_metadata.joint_value is not None:
             jk, jv, jk_scale, jv_scale = quantize_kv_fp8(

From 8495dfad9a8e9c2c7c29d070155c0e0a1003456a Mon Sep 17 00:00:00 2001
From: lishunyang <lishunyang12@163.com>
Date: Fri, 3 Apr 2026 03:36:07 +0800
Subject: [PATCH 23/45] Zero out padding K/V before FP8 quantization
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

FP8 path uses regular FA3 (not varlen) to avoid descale bug, so
padding tokens are not masked by the kernel. Zero them in K/V before
quantizing — softmax(Q * 0^T) ≈ 0, effectively masking padding.

Signed-off-by: lishunyang <lishunyang12@163.com>
---
 vllm_omni/diffusion/attention/layer.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/vllm_omni/diffusion/attention/layer.py b/vllm_omni/diffusion/attention/layer.py
index 846650d3f78..5f308c39552 100644
--- a/vllm_omni/diffusion/attention/layer.py
+++ b/vllm_omni/diffusion/attention/layer.py
@@ -195,6 +195,16 @@ def forward(
 
         # 1.5 FP8 Q/K/V quantization (after AllToAll stays BF16, before kernel)
         if self._resolve_fp8_attn():
+            # Zero out padding positions before quantizing — FP8 path skips
+            # varlen to avoid FA3 varlen+descale bug, so padding must be
+            # handled by zeroing K (makes softmax weight ≈ 0 for those positions).
+            if attn_metadata is not None and attn_metadata.attn_mask is not None:
+                mask = attn_metadata.attn_mask  # (B, S) bool
+                if not torch.all(mask):
+                    # Expand mask to (B, S, 1, 1) for broadcasting
+                    m = mask.unsqueeze(-1).unsqueeze(-1)
+                    key = key * m
+                    value = value * m
             query, key, value, attn_metadata = self._quantize_qkv_fp8(
                 query, key, value, attn_metadata
             )

From fef28140b7b2d5e1b2b872a5ef532838e339d6a3 Mon Sep 17 00:00:00 2001
From: lishunyang <lishunyang12@163.com>
Date: Fri, 3 Apr 2026 03:44:23 +0800
Subject: [PATCH 24/45] Auto-fallback to BF16 for sequences >16K tokens

FA3 FP8 accumulation on Hopper loses precision at long sequences
(flash-attention #2250). Skip FP8 and use BF16 when seq_len > 16384.

Short videos (33f, ~6K tokens): FP8 active, ~10% speedup
Long videos (121f, ~50K tokens): auto BF16 fallback, correct output

Signed-off-by: lishunyang <lishunyang12@163.com>
---
 vllm_omni/diffusion/attention/layer.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/vllm_omni/diffusion/attention/layer.py b/vllm_omni/diffusion/attention/layer.py
index 5f308c39552..397fa8f8514 100644
--- a/vllm_omni/diffusion/attention/layer.py
+++ b/vllm_omni/diffusion/attention/layer.py
@@ -194,14 +194,16 @@ def forward(
         query, key, value, attn_metadata, ctx = strategy.pre_attention(query, key, value, attn_metadata)
 
         # 1.5 FP8 Q/K/V quantization (after AllToAll stays BF16, before kernel)
-        if self._resolve_fp8_attn():
+        # Skip FP8 for long sequences — FA3 FP8 accumulation on Hopper loses
+        # precision above ~16K tokens (known issue, flash-attention #2250).
+        _FP8_MAX_SEQLEN = 16384
+        if self._resolve_fp8_attn() and query.shape[1] <= _FP8_MAX_SEQLEN:
             # Zero out padding positions before quantizing — FP8 path skips
             # varlen to avoid FA3 varlen+descale bug, so padding must be
             # handled by zeroing K (makes softmax weight ≈ 0 for those positions).
             if attn_metadata is not None and attn_metadata.attn_mask is not None:
                 mask = attn_metadata.attn_mask  # (B, S) bool
                 if not torch.all(mask):
-                    # Expand mask to (B, S, 1, 1) for broadcasting
                     m = mask.unsqueeze(-1).unsqueeze(-1)
                     key = key * m
                     value = value * m

From 68dd46c4750665bcabc605e22214c1461fa2d2fd Mon Sep 17 00:00:00 2001
From: lishunyang <lishunyang12@163.com>
Date: Fri, 3 Apr 2026 03:49:30 +0800
Subject: [PATCH 25/45] Use num_splits for FP8 accuracy at long sequences
 instead of seqlen cutoff

FA3 FP8 TC on Hopper has imprecise accumulation at long sequences.
Instead of falling back to BF16, use num_splits to chunk attention
into ~8K-token pieces, bounding the accumulation error per chunk.
This keeps FP8 active for all sequence lengths.

Signed-off-by: lishunyang <lishunyang12@163.com>
---
 vllm_omni/diffusion/attention/backends/flash_attn.py | 6 ++++++
 vllm_omni/diffusion/attention/layer.py               | 5 +----
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/vllm_omni/diffusion/attention/backends/flash_attn.py b/vllm_omni/diffusion/attention/backends/flash_attn.py
index 4131bb6e3a3..7c1d27b0a70 100644
--- a/vllm_omni/diffusion/attention/backends/flash_attn.py
+++ b/vllm_omni/diffusion/attention/backends/flash_attn.py
@@ -270,6 +270,11 @@ def _forward_fp8(
         # Use regular FA3 path even with padding — FA3 varlen + FP8 descale
         # produces corrupted output on current fa3-fwd builds. The padding
         # tokens (~4% for HunyuanVideo) add negligible extra compute.
+        #
+        # Use num_splits to improve FP8 accumulation accuracy at long sequences.
+        # FA3 FP8 TC on Hopper has imprecise accumulation (flash-attention #2250);
+        # splitting reduces tokens per chunk, bounding the error.
+        num_splits = max(1, S // 8192)  # ~8K tokens per split
         out = fa3_attn_func(
             query, key, value,
             softmax_scale=self.softmax_scale,
@@ -277,6 +282,7 @@ def _forward_fp8(
             q_descale=q_descale,
             k_descale=k_descale,
             v_descale=v_descale,
+            num_splits=num_splits,
         )
         if isinstance(out, tuple):
             out = out[0]
diff --git a/vllm_omni/diffusion/attention/layer.py b/vllm_omni/diffusion/attention/layer.py
index 397fa8f8514..295a195e7e3 100644
--- a/vllm_omni/diffusion/attention/layer.py
+++ b/vllm_omni/diffusion/attention/layer.py
@@ -194,10 +194,7 @@ def forward(
         query, key, value, attn_metadata, ctx = strategy.pre_attention(query, key, value, attn_metadata)
 
         # 1.5 FP8 Q/K/V quantization (after AllToAll stays BF16, before kernel)
-        # Skip FP8 for long sequences — FA3 FP8 accumulation on Hopper loses
-        # precision above ~16K tokens (known issue, flash-attention #2250).
-        _FP8_MAX_SEQLEN = 16384
-        if self._resolve_fp8_attn() and query.shape[1] <= _FP8_MAX_SEQLEN:
+        if self._resolve_fp8_attn():
             # Zero out padding positions before quantizing — FP8 path skips
             # varlen to avoid FA3 varlen+descale bug, so padding must be
             # handled by zeroing K (makes softmax weight ≈ 0 for those positions).

From 30180bb61c45c4861e5e21fdf3a409a099f150a5 Mon Sep 17 00:00:00 2001
From: lishunyang <lishunyang12@163.com>
Date: Fri, 3 Apr 2026 04:06:57 +0800
Subject: [PATCH 26/45] Add debug_fa3_version.py: check FA3 builds and
 two-level accumulation

Signed-off-by: lishunyang <lishunyang12@163.com>
---
 debug_fa3_version.py | 130 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 130 insertions(+)
 create mode 100644 debug_fa3_version.py

diff --git a/debug_fa3_version.py b/debug_fa3_version.py
new file mode 100644
index 00000000000..3cfb58f4fd7
--- /dev/null
+++ b/debug_fa3_version.py
@@ -0,0 +1,130 @@
+"""Debug: check which flash-attention builds are available and whether
+the FP8 two-level accumulation fix is present."""
+import importlib
+import os
+import sys
+
+print("=" * 60)
+print("1. fa3-fwd (current FP8 attention backend)")
+print("=" * 60)
+try:
+    import fa3_fwd_interface
+    print(f"  Location: {fa3_fwd_interface.__file__}")
+    # Check if the source has two-level accumulation
+    src_path = os.path.dirname(fa3_fwd_interface.__file__)
+    print(f"  Package dir: {src_path}")
+    # List files
+    for f in sorted(os.listdir(src_path)):
+        if f.endswith(('.py', '.so', '.pyd')):
+            print(f"    {f}")
+except Exception as e:
+    print(f"  Not available: {e}")
+
+try:
+    import fa3_fwd_cuda
+    print(f"\n  fa3_fwd_cuda location: {fa3_fwd_cuda.__file__}")
+except Exception as e:
+    print(f"\n  fa3_fwd_cuda: {e}")
+
+print("\n" + "=" * 60)
+print("2. vLLM's flash-attention (may have the fix)")
+print("=" * 60)
+
+# Check vLLM's internal flash-attn
+paths_to_check = [
+    "vllm.attention.backends.flash_attn",
+    "vllm.vllm_flash_attn",
+    "vllm._custom_ops",
+]
+for mod_path in paths_to_check:
+    try:
+        mod = importlib.import_module(mod_path)
+        print(f"  {mod_path}: {mod.__file__}")
+    except Exception as e:
+        print(f"  {mod_path}: not available ({e})")
+
+# Check if vLLM ships its own flash_attn_func with descale
+try:
+    from vllm.attention.backends.flash_attn import flash_attn_varlen_func
+    import inspect
+    sig = inspect.signature(flash_attn_varlen_func)
+    params = list(sig.parameters.keys())
+    has_descale = any("descale" in p for p in params)
+    print(f"\n  vLLM flash_attn_varlen_func params: {params}")
+    print(f"  Has descale: {has_descale}")
+except Exception as e:
+    print(f"\n  vLLM flash_attn_varlen_func: {e}")
+
+print("\n" + "=" * 60)
+print("3. flash_attn pip package")
+print("=" * 60)
+try:
+    import flash_attn
+    print(f"  Version: {flash_attn.__version__}")
+    print(f"  Location: {flash_attn.__file__}")
+except Exception as e:
+    print(f"  Not installed: {e}")
+
+print("\n" + "=" * 60)
+print("4. Check for two-level accumulation in fa3-fwd source")
+print("=" * 60)
+try:
+    import fa3_fwd_interface
+    src_file = fa3_fwd_interface.__file__
+    with open(src_file, 'r') as f:
+        content = f.read()
+    # Search for signs of two-level accumulation
+    keywords = [
+        "two_level", "TWO_LEVEL",
+        "accum_fp32", "ACCUM_FP32",
+        "fp8_two_level", "FP8_TWO_LEVEL",
+        "accumulation_fix",
+        "flush_accum",
+    ]
+    found = False
+    for kw in keywords:
+        if kw.lower() in content.lower():
+            print(f"  Found '{kw}' in fa3_fwd_interface.py")
+            found = True
+    if not found:
+        print("  No two-level accumulation keywords found in fa3_fwd_interface.py")
+        print("  -> This build likely does NOT have the FP8 accumulation fix")
+except Exception as e:
+    print(f"  Could not read source: {e}")
+
+print("\n" + "=" * 60)
+print("5. vllm-project/flash-attention fork check")
+print("=" * 60)
+# Check if there's a vllm flash_attn with the fix
+search_paths = [
+    "/workspace/.venv/lib/python3.12/site-packages/vllm",
+    "/workspace/.venv/lib/python3.12/site-packages/flash_attn",
+    "/workspace/.venv/lib/python3.12/site-packages",
+]
+for base in search_paths:
+    if os.path.isdir(base):
+        for root, dirs, files in os.walk(base):
+            for f in files:
+                if "flash" in f.lower() and f.endswith('.so'):
+                    full = os.path.join(root, f)
+                    size_mb = os.path.getsize(full) / 1024 / 1024
+                    print(f"  {full} ({size_mb:.1f} MB)")
+            # Don't recurse too deep
+            if root.count(os.sep) - base.count(os.sep) > 2:
+                dirs.clear()
+
+print("\n" + "=" * 60)
+print("6. Environment variable check")
+print("=" * 60)
+env_vars = [
+    "FLASH_ATTENTION_DISABLE_FP8_TWO_LEVEL_ACCUMULATION",
+    "FLASH_ATTENTION_FORCE_FP8_TWO_LEVEL_ACCUMULATION",
+    "VLLM_FLASH_ATTN_SRC_DIR",
+]
+for var in env_vars:
+    val = os.environ.get(var, "<not set>")
+    print(f"  {var}: {val}")
+
+print("\n" + "=" * 60)
+print("Done.")
+print("=" * 60)

From fb83d5981f51448665a325d65f085723b67985a7 Mon Sep 17 00:00:00 2001
From: lishunyang <lishunyang12@163.com>
Date: Fri, 3 Apr 2026 04:10:18 +0800
Subject: [PATCH 27/45] Add debug_vllm_fa.py: check vLLM's bundled
 flash-attention for FP8 fix

Signed-off-by: lishunyang <lishunyang12@163.com>
---
 debug_vllm_fa.py | 157 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 157 insertions(+)
 create mode 100644 debug_vllm_fa.py

diff --git a/debug_vllm_fa.py b/debug_vllm_fa.py
new file mode 100644
index 00000000000..eae70097802
--- /dev/null
+++ b/debug_vllm_fa.py
@@ -0,0 +1,157 @@
+"""Check vllm.vllm_flash_attn for FP8 support and two-level accumulation."""
+import inspect
+import os
+
+print("=" * 60)
+print("1. vllm.vllm_flash_attn contents")
+print("=" * 60)
+try:
+    import vllm.vllm_flash_attn as vfa
+    print(f"Location: {vfa.__file__}")
+    pkg_dir = os.path.dirname(vfa.__file__)
+    for f in sorted(os.listdir(pkg_dir)):
+        full = os.path.join(pkg_dir, f)
+        if os.path.isfile(full):
+            size = os.path.getsize(full)
+            print(f"  {f} ({size/1024:.1f} KB)")
+
+    print(f"\nExported names: {dir(vfa)}")
+except Exception as e:
+    print(f"Not available: {e}")
+
+print("\n" + "=" * 60)
+print("2. Check for flash_attn_func / varlen with descale")
+print("=" * 60)
+funcs_to_check = [
+    "flash_attn_func",
+    "flash_attn_varlen_func",
+    "flash_attn_with_kvcache",
+]
+for fname in funcs_to_check:
+    try:
+        func = getattr(vfa, fname, None)
+        if func is None:
+            # Try submodule
+            try:
+                from vllm.vllm_flash_attn import flash_attn_interface
+                func = getattr(flash_attn_interface, fname, None)
+            except:
+                pass
+        if func is not None:
+            sig = inspect.signature(func)
+            params = list(sig.parameters.keys())
+            has_descale = any("descale" in p for p in params)
+            print(f"\n  {fname}:")
+            print(f"    params: {params}")
+            print(f"    has descale: {has_descale}")
+        else:
+            print(f"\n  {fname}: not found")
+    except Exception as e:
+        print(f"\n  {fname}: error - {e}")
+
+print("\n" + "=" * 60)
+print("3. Check for two-level accumulation in source")
+print("=" * 60)
+try:
+    pkg_dir = os.path.dirname(vfa.__file__)
+    keywords = [
+        "two_level", "TWO_LEVEL", "fp8_two_level",
+        "FP8_TWO_LEVEL", "accum", "flush",
+    ]
+    for f in os.listdir(pkg_dir):
+        if f.endswith('.py'):
+            filepath = os.path.join(pkg_dir, f)
+            with open(filepath, 'r') as fh:
+                content = fh.read()
+            for kw in keywords:
+                if kw.lower() in content.lower():
+                    # Find the line
+                    for i, line in enumerate(content.split('\n')):
+                        if kw.lower() in line.lower():
+                            print(f"  {f}:{i+1}: {line.strip()[:100]}")
+                            break
+except Exception as e:
+    print(f"  Error: {e}")
+
+print("\n" + "=" * 60)
+print("4. Check CUDA backend for FP8")
+print("=" * 60)
+try:
+    pkg_dir = os.path.dirname(vfa.__file__)
+    for f in os.listdir(pkg_dir):
+        if f.endswith('.so') or f.endswith('.pyd'):
+            full = os.path.join(pkg_dir, f)
+            size_mb = os.path.getsize(full) / 1024 / 1024
+            print(f"  {f} ({size_mb:.1f} MB)")
+except Exception as e:
+    print(f"  Error: {e}")
+
+print("\n" + "=" * 60)
+print("5. Quick FP8 functional test with vllm_flash_attn")
+print("=" * 60)
+try:
+    import torch
+    if not torch.cuda.is_available():
+        print("  No GPU, skipping")
+    else:
+        # Try to use vllm's flash_attn for FP8
+        func = getattr(vfa, 'flash_attn_func', None)
+        if func is None:
+            try:
+                from vllm.vllm_flash_attn.flash_attn_interface import flash_attn_func as func
+            except:
+                pass
+
+        if func is not None:
+            sig = inspect.signature(func)
+            params = list(sig.parameters.keys())
+            if any("descale" in p for p in params):
+                # Test FP8 attention
+                B, S, H, D = 1, 1024, 16, 128
+                q = torch.randn(B, S, H, D, dtype=torch.bfloat16, device="cuda")
+                k = torch.randn(B, S, H, D, dtype=torch.bfloat16, device="cuda")
+                v = torch.randn(B, S, H, D, dtype=torch.bfloat16, device="cuda")
+
+                # Quantize to FP8
+                from vllm_omni.quantization.kv_quant import quantize_qkv_fp8
+                fp8_q, fp8_k, fp8_v, qs, ks, vs = quantize_qkv_fp8(q, k, v)
+
+                qs_2d = qs.view(1,1).expand(B, H).contiguous()
+                ks_2d = ks.view(1,1).expand(B, H).contiguous()
+                vs_2d = vs.view(1,1).expand(B, H).contiguous()
+
+                # Find the right param names
+                descale_params = [p for p in params if "descale" in p]
+                print(f"  Descale param names: {descale_params}")
+
+                kwargs = {"softmax_scale": D**-0.5, "causal": False}
+                for p in descale_params:
+                    if "q" in p: kwargs[p] = qs_2d
+                    elif "k" in p: kwargs[p] = ks_2d
+                    elif "v" in p: kwargs[p] = vs_2d
+
+                out = func(fp8_q, fp8_k, fp8_v, **kwargs)
+                if isinstance(out, tuple):
+                    out = out[0]
+                print(f"  FP8 test passed! Output shape: {out.shape}, dtype: {out.dtype}")
+                print(f"  Output has NaN: {torch.isnan(out).any()}")
+                print(f"  Output has Inf: {torch.isinf(out).any()}")
+
+                # Compare with BF16
+                out_bf16 = func(q, k, v, softmax_scale=D**-0.5, causal=False)
+                if isinstance(out_bf16, tuple):
+                    out_bf16 = out_bf16[0]
+                diff = (out.float() - out_bf16.float()).abs().mean().item()
+                print(f"  Mean abs diff vs BF16: {diff:.6f}")
+            else:
+                print("  vllm flash_attn_func has no descale params")
+        else:
+            print("  No flash_attn_func found in vllm_flash_attn")
+except Exception as e:
+    print(f"  Error: {e}")
+    import traceback
+    traceback.print_exc()
+
+print("\n" + "=" * 60)
+print("Done.")
+print("=" * 60)

From aab93f4a597ec31ac1062e3686cf7a2612fc2f0d Mon Sep 17 00:00:00 2001
From: lishunyang <lishunyang12@163.com>
Date: Fri, 3 Apr 2026 04:14:29 +0800
Subject: [PATCH 28/45] Switch FP8 path to vLLM's bundled FA3 (has two-level
 accumulation fix)

vLLM ships _vllm_fa3_C.abi3.so built from vllm-project/flash-attention
which has the FP8 two-level accumulation fix (PR #104). This fixes the
Hopper FP8 precision issue at long sequences (50K+ tokens).

Falls back to fa3-fwd then BF16 dequant if vLLM's FA3 is unavailable.

Signed-off-by: lishunyang <lishunyang12@163.com>
---
 .../attention/backends/flash_attn.py          | 110 +++++++++++-------
 1 file changed, 69 insertions(+), 41 deletions(-)

diff --git a/vllm_omni/diffusion/attention/backends/flash_attn.py b/vllm_omni/diffusion/attention/backends/flash_attn.py
index 7c1d27b0a70..ea5147ae38d 100644
--- a/vllm_omni/diffusion/attention/backends/flash_attn.py
+++ b/vllm_omni/diffusion/attention/backends/flash_attn.py
@@ -234,56 +234,84 @@ def _forward_fp8(
         value: torch.Tensor,
         attn_metadata: AttentionMetadata,
     ) -> torch.Tensor:
-        """FP8 Q/K/V attention path: native FA3 with descale."""
+        """FP8 Q/K/V attention path.
+
+        Uses vLLM's bundled FA3 backend (vllm_flash_attn) which has the
+        two-level accumulation fix for FP8 on Hopper. Falls back to
+        fa3-fwd or dequant if vLLM's FA3 is unavailable.
+        """
         q_scale = attn_metadata.q_scale
         k_scale = attn_metadata.k_scale
         v_scale = attn_metadata.v_scale
 
+        B, S, H, D = key.shape
+        q_descale = self._reshape_descale(q_scale, B, H)
+        k_descale = self._reshape_descale(k_scale, B, H)
+        v_descale = self._reshape_descale(v_scale, B, H)
+
+        # Try vLLM's bundled FA3 (has two-level accumulation fix for FP8)
+        try:
+            from vllm.vllm_flash_attn import flash_attn_varlen_func as vllm_varlen
+
+            # varlen API needs (total_tokens, H, D) and cu_seqlens
+            q_flat = query.reshape(B * S, H, D)
+            k_flat = key.reshape(B * S, H, D)
+            v_flat = value.reshape(B * S, H, D)
+            cu_seqlens = torch.arange(
+                0, (B + 1) * S, step=S, dtype=torch.int32, device=query.device
+            )
+
+            out = vllm_varlen(
+                q_flat, k_flat, v_flat,
+                max_seqlen_q=S,
+                cu_seqlens_q=cu_seqlens,
+                max_seqlen_k=S,
+                cu_seqlens_k=cu_seqlens,
+                softmax_scale=self.softmax_scale,
+                causal=self.causal,
+                q_descale=q_descale,
+                k_descale=k_descale,
+                v_descale=v_descale,
+                fa_version=3,
+            )
+            if isinstance(out, tuple):
+                out = out[0]
+            return out.reshape(B, S, H, D)
+        except Exception as e:
+            logger.warning_once(
+                "vLLM FA3 FP8 failed (%s), trying fa3-fwd fallback.", e
+            )
+
+        # Fallback: fa3-fwd (may lack two-level accumulation fix)
         from vllm_omni.diffusion.attention.backends.ring.ring_globals import (
             HAS_FA3,
             fa3_attn_func,
         )
 
-        if not (HAS_FA3 and fa3_attn_func is not None):
-            # No FA3: dequant and use standard path
-            from vllm_omni.quantization.kv_quant import dequantize_fp8
-
-            logger.warning_once(
-                "FP8 attention without FA3 provides no compute benefit. "
-                "Install FA3 for optimal FP8 support on Hopper GPUs."
+        if HAS_FA3 and fa3_attn_func is not None:
+            out = fa3_attn_func(
+                query, key, value,
+                softmax_scale=self.softmax_scale,
+                causal=self.causal,
+                q_descale=q_descale,
+                k_descale=k_descale,
+                v_descale=v_descale,
             )
-            output_dtype = torch.bfloat16
-            query = dequantize_fp8(query, q_scale, output_dtype)
-            key = dequantize_fp8(key, k_scale, output_dtype)
-            value = dequantize_fp8(value, v_scale, output_dtype)
-            attn_metadata.q_scale = None
-            attn_metadata.k_scale = None
-            attn_metadata.v_scale = None
-            return self.forward_cuda(query, key, value, attn_metadata)
-
-        # Reshape per-tensor scales to FA3's expected (batch, num_kv_heads)
-        B, S, H, D = key.shape
-        q_descale = self._reshape_descale(q_scale, B, H)
-        k_descale = self._reshape_descale(k_scale, B, H)
-        v_descale = self._reshape_descale(v_scale, B, H)
+            if isinstance(out, tuple):
+                out = out[0]
+            return out
 
-        # Use regular FA3 path even with padding — FA3 varlen + FP8 descale
-        # produces corrupted output on current fa3-fwd builds. The padding
-        # tokens (~4% for HunyuanVideo) add negligible extra compute.
-        #
-        # Use num_splits to improve FP8 accumulation accuracy at long sequences.
-        # FA3 FP8 TC on Hopper has imprecise accumulation (flash-attention #2250);
-        # splitting reduces tokens per chunk, bounding the error.
-        num_splits = max(1, S // 8192)  # ~8K tokens per split
-        out = fa3_attn_func(
-            query, key, value,
-            softmax_scale=self.softmax_scale,
-            causal=self.causal,
-            q_descale=q_descale,
-            k_descale=k_descale,
-            v_descale=v_descale,
-            num_splits=num_splits,
+        # Last resort: dequant to BF16
+        from vllm_omni.quantization.kv_quant import dequantize_fp8
+
+        logger.warning_once(
+            "No FA3 available for FP8 attention. Dequantizing to BF16."
         )
-        if isinstance(out, tuple):
-            out = out[0]
-        return out
+        output_dtype = torch.bfloat16
+        query = dequantize_fp8(query, q_scale, output_dtype)
+        key = dequantize_fp8(key, k_scale, output_dtype)
+        value = dequantize_fp8(value, v_scale, output_dtype)
+        attn_metadata.q_scale = None
+        attn_metadata.k_scale = None
+        attn_metadata.v_scale = None
+        return self.forward_cuda(query, key, value, attn_metadata)

From 1838fc1638ff04f0b91e07c9a624a9d8792a43a9 Mon Sep 17 00:00:00 2001
From: lishunyang12 <lishunyang12@163.com>
Date: Mon, 6 Apr 2026 09:58:10 +0800
Subject: [PATCH 29/45] Optimize FP8 attention: zero-overhead quantization,
 direct FA3 path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Replace dynamic per-tensor scaling (expensive amax reduction) with
  direct saturating cast (scale=1.0). Safe for diffusion where Q/K/V
  values are well within FP8 e4m3 range (±448).
- Use fa3_attn_func directly instead of varlen API to eliminate
  reshape + cu_seqlens overhead.
- Remove padding mask zeroing (unnecessary for non-varlen path).
- Remove debug scripts from PR.

Expected ~20% speedup over BF16 for 121-frame HunyuanVideo generation.

Signed-off-by: lishunyang <lishunyang12@163.com>
---
 debug_fa3_check.py                            | 383 ------------------
 debug_fa3_version.py                          | 130 ------
 debug_shapes.py                               | 113 ------
 debug_vllm_fa.py                              | 157 -------
 .../attention/backends/flash_attn.py          |  44 +-
 vllm_omni/diffusion/attention/layer.py        |  11 +-
 vllm_omni/quantization/kv_quant.py            |  33 ++
 7 files changed, 43 insertions(+), 828 deletions(-)
 delete mode 100644 debug_fa3_check.py
 delete mode 100644 debug_fa3_version.py
 delete mode 100644 debug_shapes.py
 delete mode 100644 debug_vllm_fa.py

diff --git a/debug_fa3_check.py b/debug_fa3_check.py
deleted file mode 100644
index b173735d943..00000000000
--- a/debug_fa3_check.py
+++ /dev/null
@@ -1,383 +0,0 @@
-"""Debug script: check FA3 FP8 capabilities, run micro-benchmarks,
-and profile attention kernel performance for optimization planning."""
-import inspect
-import time
-
-import torch
-
-# ============================================================
-# 1. FA3 FP8 descale support check
-# ============================================================
-print("=" * 60)
-print("1. FA3 FP8 descale support check")
-print("=" * 60)
-
-try:
-    from fa3_fwd_interface import _flash_attn_forward
-    sig = inspect.signature(_flash_attn_forward)
-    params = list(sig.parameters.keys())
-    print(f"fa3_fwd params: {params}")
-    has_descale = any("descale" in p for p in params)
-    print(f"Has descale support: {has_descale}")
-except Exception as e:
-    print(f"fa3_fwd not available: {e}")
-
-try:
-    from vllm_omni.diffusion.attention.backends.utils.fa import (
-        flash_attn_varlen_func,
-    )
-    sig = inspect.signature(flash_attn_varlen_func)
-    params = list(sig.parameters.keys())
-    print(f"\nvarlen func params: {params}")
-    has_descale = any("descale" in p for p in params)
-    print(f"Varlen has descale: {has_descale}")
-except Exception as e:
-    print(f"varlen not available: {e}")
-
-try:
-    from vllm_omni.diffusion.attention.backends.ring.ring_globals import (
-        HAS_FA3,
-        fa3_attn_func,
-    )
-    print(f"\nHAS_FA3: {HAS_FA3}")
-    if fa3_attn_func is not None:
-        sig = inspect.signature(fa3_attn_func)
-        params = list(sig.parameters.keys())
-        print(f"fa3_attn_func params: {params}")
-except Exception as e:
-    print(f"ring_globals import failed: {e}")
-
-# ============================================================
-# 2. Package versions and GPU info
-# ============================================================
-print("\n" + "=" * 60)
-print("2. Environment")
-print("=" * 60)
-
-try:
-    import flash_attn
-    print(f"flash_attn version: {flash_attn.__version__}")
-except Exception:
-    print("flash_attn: not installed or no __version__")
-
-print(f"torch version: {torch.__version__}")
-print(f"CUDA version: {torch.version.cuda}")
-if torch.cuda.is_available():
-    gpu = torch.cuda.get_device_name(0)
-    cap = torch.cuda.get_device_capability(0)
-    props = torch.cuda.get_device_properties(0)
-    mem = getattr(props, 'total_memory', getattr(props, 'total_mem', 0)) / 1024**3
-    print(f"GPU: {gpu} (SM {cap[0]}{cap[1]}, {mem:.1f} GB)")
-    print(f"  FP8 tensor cores: {'Yes' if cap[0] >= 9 else 'No'} (need SM90+)")
-else:
-    print("GPU: N/A")
-
-# Check vLLM fused quant kernel
-try:
-    from vllm._custom_ops import scaled_fp8_quant
-    print(f"vLLM scaled_fp8_quant: available")
-except Exception:
-    print("vLLM scaled_fp8_quant: NOT available (will use PyTorch fallback)")
-
-# Check torch.compile status
-print(f"torch.compile available: {hasattr(torch, 'compile')}")
-try:
-    import triton
-    print(f"triton version: {triton.__version__}")
-except Exception:
-    print("triton: not installed")
-
-# ============================================================
-# 3. FP8 micro-benchmarks (quantization overhead)
-# ============================================================
-if not torch.cuda.is_available():
-    print("\nSkipping benchmarks (no GPU)")
-    exit()
-
-print("\n" + "=" * 60)
-print("3. FP8 quantization overhead micro-benchmark")
-print("=" * 60)
-
-device = "cuda"
-
-# Simulate HunyuanVideo tensor shapes
-# 33 frames: ~(1, 2640, 24, 128) for single-stream, (1, 2640+256, 24, 128) for joint
-# Actual shapes from debug_shapes.py:
-# 121f: q=[1, 50345, 16, 128] (48360 img + ~1985 encoder tokens)
-# dummy: q=[1, 6081, 16, 128] (smaller warmup shape)
-test_shapes = [
-    ("dummy warmup", (1, 6081, 16, 128)),
-    ("121f actual", (1, 50345, 16, 128)),
-]
-
-for name, shape in test_shapes:
-    q = torch.randn(shape, dtype=torch.bfloat16, device=device)
-    k = torch.randn(shape, dtype=torch.bfloat16, device=device)
-    v = torch.randn(shape, dtype=torch.bfloat16, device=device)
-
-    # Warmup
-    for _ in range(3):
-        from vllm_omni.quantization.kv_quant import quantize_qkv_fp8
-        quantize_qkv_fp8(q, k, v)
-    torch.cuda.synchronize()
-
-    # Benchmark quantization
-    n_iters = 20
-    start = time.perf_counter()
-    for _ in range(n_iters):
-        fp8_q, fp8_k, fp8_v, qs, ks, vs = quantize_qkv_fp8(q, k, v)
-    torch.cuda.synchronize()
-    quant_time = (time.perf_counter() - start) / n_iters * 1000
-
-    # Static quant (with cached scale — no amax)
-    _, _, _, qs, ks, vs = quantize_qkv_fp8(q, k, v)
-    for _ in range(3):
-        quantize_qkv_fp8(q, k, v, cached_scales=(qs, ks, vs))
-    torch.cuda.synchronize()
-
-    start = time.perf_counter()
-    for _ in range(n_iters):
-        fp8_q, fp8_k, fp8_v, qs2, ks2, vs2 = quantize_qkv_fp8(
-            q, k, v, cached_scales=(qs, ks, vs)
-        )
-    torch.cuda.synchronize()
-    static_time = (time.perf_counter() - start) / n_iters * 1000
-
-    print(f"  {name} {list(shape)}: dynamic={quant_time:.2f} ms, "
-          f"static={static_time:.2f} ms ({quant_time/static_time:.1f}x faster)")
-
-# ============================================================
-# 4. FA3 attention kernel benchmark (BF16 vs FP8)
-# ============================================================
-print("\n" + "=" * 60)
-print("4. FA3 attention kernel benchmark (BF16 vs FP8)")
-print("=" * 60)
-
-try:
-    from vllm_omni.diffusion.attention.backends.ring.ring_globals import (
-        HAS_FA3,
-        fa3_attn_func,
-    )
-    if not HAS_FA3 or fa3_attn_func is None:
-        raise RuntimeError("FA3 not available")
-except Exception as e:
-    print(f"Skipping: {e}")
-    exit()
-
-bench_shapes = [
-    ("dummy", (1, 6081, 16, 128)),
-    ("121f actual", (1, 50345, 16, 128)),
-]
-
-n_warmup = 5
-n_iters = 20
-
-for name, shape in bench_shapes:
-    B, S, H, D = shape
-    softmax_scale = D ** -0.5
-
-    # BF16 benchmark
-    q_bf16 = torch.randn(shape, dtype=torch.bfloat16, device=device)
-    k_bf16 = torch.randn(shape, dtype=torch.bfloat16, device=device)
-    v_bf16 = torch.randn(shape, dtype=torch.bfloat16, device=device)
-
-    for _ in range(n_warmup):
-        fa3_attn_func(q_bf16, k_bf16, v_bf16, softmax_scale=softmax_scale, causal=False)
-    torch.cuda.synchronize()
-
-    start = time.perf_counter()
-    for _ in range(n_iters):
-        fa3_attn_func(q_bf16, k_bf16, v_bf16, softmax_scale=softmax_scale, causal=False)
-    torch.cuda.synchronize()
-    bf16_time = (time.perf_counter() - start) / n_iters * 1000
-
-    # FP8 benchmark (quantize + attention)
-    fp8_q, fp8_k, fp8_v, qs, ks, vs = quantize_qkv_fp8(q_bf16, k_bf16, v_bf16)
-    # FA3 expects descale shape (batch, num_kv_heads)
-    qs_2d = qs.view(1, 1).expand(B, H).contiguous()
-    ks_2d = ks.view(1, 1).expand(B, H).contiguous()
-    vs_2d = vs.view(1, 1).expand(B, H).contiguous()
-
-    for _ in range(n_warmup):
-        fa3_attn_func(fp8_q, fp8_k, fp8_v, softmax_scale=softmax_scale,
-                      causal=False, q_descale=qs_2d, k_descale=ks_2d, v_descale=vs_2d)
-    torch.cuda.synchronize()
-
-    # FP8 kernel only (no quant overhead)
-    start = time.perf_counter()
-    for _ in range(n_iters):
-        fa3_attn_func(fp8_q, fp8_k, fp8_v, softmax_scale=softmax_scale,
-                      causal=False, q_descale=qs_2d, k_descale=ks_2d, v_descale=vs_2d)
-    torch.cuda.synchronize()
-    fp8_kernel_time = (time.perf_counter() - start) / n_iters * 1000
-
-    # FP8 end-to-end (quant + attention)
-    start = time.perf_counter()
-    for _ in range(n_iters):
-        fp8_q, fp8_k, fp8_v, qs, ks, vs = quantize_qkv_fp8(q_bf16, k_bf16, v_bf16)
-        qs_2d = qs.view(1, 1).expand(B, H).contiguous()
-        ks_2d = ks.view(1, 1).expand(B, H).contiguous()
-        vs_2d = vs.view(1, 1).expand(B, H).contiguous()
-        fa3_attn_func(fp8_q, fp8_k, fp8_v, softmax_scale=softmax_scale,
-                      causal=False, q_descale=qs_2d, k_descale=ks_2d, v_descale=vs_2d)
-    torch.cuda.synchronize()
-    fp8_e2e_time = (time.perf_counter() - start) / n_iters * 1000
-
-    speedup_kernel = bf16_time / fp8_kernel_time
-    speedup_e2e = bf16_time / fp8_e2e_time
-
-    print(f"\n  {name} {list(shape)}:")
-    print(f"    BF16 attn:           {bf16_time:.2f} ms")
-    print(f"    FP8 kernel only:     {fp8_kernel_time:.2f} ms  ({speedup_kernel:.2f}x)")
-    print(f"    FP8 quant+kernel:    {fp8_e2e_time:.2f} ms  ({speedup_e2e:.2f}x)")
-    print(f"    Quant overhead:      {fp8_e2e_time - fp8_kernel_time:.2f} ms")
-
-# ============================================================
-# 5. FA3 varlen FP8 benchmark (with padding mask)
-# ============================================================
-print("\n" + "=" * 60)
-print("5. FA3 varlen FP8 benchmark (with padding mask)")
-print("=" * 60)
-
-try:
-    from vllm_omni.diffusion.attention.backends.utils.fa import (
-        flash_attn_varlen_func,
-        _unpad_input,
-        _upad_input,
-    )
-except Exception as e:
-    print(f"Skipping varlen benchmark: {e}")
-    exit()
-
-for name, shape in bench_shapes:
-    B, S, H, D = shape
-    softmax_scale = D ** -0.5
-
-    q_bf16 = torch.randn(shape, dtype=torch.bfloat16, device=device)
-    k_bf16 = torch.randn(shape, dtype=torch.bfloat16, device=device)
-    v_bf16 = torch.randn(shape, dtype=torch.bfloat16, device=device)
-
-    # Create a realistic mask: ~1974 false values (actual encoder padding)
-    mask = torch.ones(B, S, dtype=torch.bool, device=device)
-    n_pad = 1974  # actual mask_false from debug_shapes.py
-    mask[:, -n_pad:] = False
-
-    # Unpad inputs
-    q_up, k_up, v_up, indices_q, (cu_q, cu_k), (max_q, max_k) = _upad_input(
-        q_bf16, k_bf16, v_bf16, mask, S, _unpad_input
-    )
-
-    # BF16 varlen
-    for _ in range(n_warmup):
-        flash_attn_varlen_func(q_up, k_up, v_up, cu_seqlens_q=cu_q, cu_seqlens_k=cu_k,
-                               max_seqlen_q=max_q, max_seqlen_k=max_k,
-                               softmax_scale=softmax_scale, causal=False)
-    torch.cuda.synchronize()
-
-    start = time.perf_counter()
-    for _ in range(n_iters):
-        flash_attn_varlen_func(q_up, k_up, v_up, cu_seqlens_q=cu_q, cu_seqlens_k=cu_k,
-                               max_seqlen_q=max_q, max_seqlen_k=max_k,
-                               softmax_scale=softmax_scale, causal=False)
-    torch.cuda.synchronize()
-    varlen_bf16 = (time.perf_counter() - start) / n_iters * 1000
-
-    # FP8 varlen
-    fp8_q, fp8_k, fp8_v, qs, ks, vs = quantize_qkv_fp8(q_bf16, k_bf16, v_bf16)
-    q_up_fp8, k_up_fp8, v_up_fp8, _, _, _ = _upad_input(
-        fp8_q, fp8_k, fp8_v, mask, S, _unpad_input
-    )
-
-    # FA3 expects descale shape (batch, num_kv_heads)
-    qs_2d = qs.view(1, 1).expand(B, H).contiguous()
-    ks_2d = ks.view(1, 1).expand(B, H).contiguous()
-    vs_2d = vs.view(1, 1).expand(B, H).contiguous()
-
-    for _ in range(n_warmup):
-        flash_attn_varlen_func(q_up_fp8, k_up_fp8, v_up_fp8,
-                               cu_seqlens_q=cu_q, cu_seqlens_k=cu_k,
-                               max_seqlen_q=max_q, max_seqlen_k=max_k,
-                               softmax_scale=softmax_scale, causal=False,
-                               q_descale=qs_2d, k_descale=ks_2d, v_descale=vs_2d)
-    torch.cuda.synchronize()
-
-    start = time.perf_counter()
-    for _ in range(n_iters):
-        flash_attn_varlen_func(q_up_fp8, k_up_fp8, v_up_fp8,
-                               cu_seqlens_q=cu_q, cu_seqlens_k=cu_k,
-                               max_seqlen_q=max_q, max_seqlen_k=max_k,
-                               softmax_scale=softmax_scale, causal=False,
-                               q_descale=qs_2d, k_descale=ks_2d, v_descale=vs_2d)
-    torch.cuda.synchronize()
-    varlen_fp8 = (time.perf_counter() - start) / n_iters * 1000
-
-    speedup = varlen_bf16 / varlen_fp8
-
-    print(f"\n  {name} {list(shape)} ({n_pad} padding):")
-    print(f"    BF16 varlen:  {varlen_bf16:.2f} ms")
-    print(f"    FP8 varlen:   {varlen_fp8:.2f} ms  ({speedup:.2f}x)")
-
-# ============================================================
-# 6. Breakdown: where time goes in a DiT layer
-# ============================================================
-print("\n" + "=" * 60)
-print("6. Time breakdown estimate for one DiT layer")
-print("=" * 60)
-
-shape_121f = (1, 50345, 16, 128)
-B, S, H, D = shape_121f
-hidden_dim = H * D  # 3072
-softmax_scale = D ** -0.5
-
-# Linear projections (Q/K/V projection + output projection)
-x = torch.randn(B, S, hidden_dim, dtype=torch.bfloat16, device=device)
-w_qkv = torch.randn(hidden_dim * 3, hidden_dim, dtype=torch.bfloat16, device=device)
-w_out = torch.randn(hidden_dim, hidden_dim, dtype=torch.bfloat16, device=device)
-
-for _ in range(n_warmup):
-    torch.nn.functional.linear(x, w_qkv)
-torch.cuda.synchronize()
-
-start = time.perf_counter()
-for _ in range(n_iters):
-    torch.nn.functional.linear(x, w_qkv)
-torch.cuda.synchronize()
-linear_qkv_time = (time.perf_counter() - start) / n_iters * 1000
-
-for _ in range(n_warmup):
-    torch.nn.functional.linear(x, w_out)
-torch.cuda.synchronize()
-
-start = time.perf_counter()
-for _ in range(n_iters):
-    torch.nn.functional.linear(x, w_out)
-torch.cuda.synchronize()
-linear_out_time = (time.perf_counter() - start) / n_iters * 1000
-
-# Attention (already measured above)
-q_bf16 = torch.randn(shape_121f, dtype=torch.bfloat16, device=device)
-k_bf16 = torch.randn(shape_121f, dtype=torch.bfloat16, device=device)
-v_bf16 = torch.randn(shape_121f, dtype=torch.bfloat16, device=device)
-
-for _ in range(n_warmup):
-    fa3_attn_func(q_bf16, k_bf16, v_bf16, softmax_scale=softmax_scale, causal=False)
-torch.cuda.synchronize()
-
-start = time.perf_counter()
-for _ in range(n_iters):
-    fa3_attn_func(q_bf16, k_bf16, v_bf16, softmax_scale=softmax_scale, causal=False)
-torch.cuda.synchronize()
-attn_time = (time.perf_counter() - start) / n_iters * 1000
-
-total = linear_qkv_time + attn_time + linear_out_time
-print(f"  121f single layer breakdown (estimated):")
-print(f"    QKV projection:  {linear_qkv_time:.2f} ms ({linear_qkv_time/total*100:.0f}%)")
-print(f"    Attention:       {attn_time:.2f} ms ({attn_time/total*100:.0f}%)")
-print(f"    Output proj:     {linear_out_time:.2f} ms ({linear_out_time/total*100:.0f}%)")
-print(f"    Total:           {total:.2f} ms")
-print(f"    Layers x steps:  54 layers x 30 steps = {54*30} calls")
-print(f"    Attn total est:  {attn_time * 54 * 30 / 1000:.1f}s out of ~{total * 54 * 30 / 1000:.1f}s")
-
-print("\n" + "=" * 60)
-print("Done.")
-print("=" * 60)
diff --git a/debug_fa3_version.py b/debug_fa3_version.py
deleted file mode 100644
index 3cfb58f4fd7..00000000000
--- a/debug_fa3_version.py
+++ /dev/null
@@ -1,130 +0,0 @@
-"""Debug: check which flash-attention builds are available and whether
-the FP8 two-level accumulation fix is present."""
-import importlib
-import os
-import sys
-
-print("=" * 60)
-print("1. fa3-fwd (current FP8 attention backend)")
-print("=" * 60)
-try:
-    import fa3_fwd_interface
-    print(f"  Location: {fa3_fwd_interface.__file__}")
-    # Check if the source has two-level accumulation
-    src_path = os.path.dirname(fa3_fwd_interface.__file__)
-    print(f"  Package dir: {src_path}")
-    # List files
-    for f in sorted(os.listdir(src_path)):
-        if f.endswith(('.py', '.so', '.pyd')):
-            print(f"    {f}")
-except Exception as e:
-    print(f"  Not available: {e}")
-
-try:
-    import fa3_fwd_cuda
-    print(f"\n  fa3_fwd_cuda location: {fa3_fwd_cuda.__file__}")
-except Exception as e:
-    print(f"\n  fa3_fwd_cuda: {e}")
-
-print("\n" + "=" * 60)
-print("2. vLLM's flash-attention (may have the fix)")
-print("=" * 60)
-
-# Check vLLM's internal flash-attn
-paths_to_check = [
-    "vllm.attention.backends.flash_attn",
-    "vllm.vllm_flash_attn",
-    "vllm._custom_ops",
-]
-for mod_path in paths_to_check:
-    try:
-        mod = importlib.import_module(mod_path)
-        print(f"  {mod_path}: {mod.__file__}")
-    except Exception as e:
-        print(f"  {mod_path}: not available ({e})")
-
-# Check if vLLM ships its own flash_attn_func with descale
-try:
-    from vllm.attention.backends.flash_attn import flash_attn_varlen_func
-    import inspect
-    sig = inspect.signature(flash_attn_varlen_func)
-    params = list(sig.parameters.keys())
-    has_descale = any("descale" in p for p in params)
-    print(f"\n  vLLM flash_attn_varlen_func params: {params}")
-    print(f"  Has descale: {has_descale}")
-except Exception as e:
-    print(f"\n  vLLM flash_attn_varlen_func: {e}")
-
-print("\n" + "=" * 60)
-print("3. flash_attn pip package")
-print("=" * 60)
-try:
-    import flash_attn
-    print(f"  Version: {flash_attn.__version__}")
-    print(f"  Location: {flash_attn.__file__}")
-except Exception as e:
-    print(f"  Not installed: {e}")
-
-print("\n" + "=" * 60)
-print("4. Check for two-level accumulation in fa3-fwd source")
-print("=" * 60)
-try:
-    import fa3_fwd_interface
-    src_file = fa3_fwd_interface.__file__
-    with open(src_file, 'r') as f:
-        content = f.read()
-    # Search for signs of two-level accumulation
-    keywords = [
-        "two_level", "TWO_LEVEL",
-        "accum_fp32", "ACCUM_FP32",
-        "fp8_two_level", "FP8_TWO_LEVEL",
-        "accumulation_fix",
-        "flush_accum",
-    ]
-    found = False
-    for kw in keywords:
-        if kw.lower() in content.lower():
-            print(f"  Found '{kw}' in fa3_fwd_interface.py")
-            found = True
-    if not found:
-        print("  No two-level accumulation keywords found in fa3_fwd_interface.py")
-        print("  -> This build likely does NOT have the FP8 accumulation fix")
-except Exception as e:
-    print(f"  Could not read source: {e}")
-
-print("\n" + "=" * 60)
-print("5. vllm-project/flash-attention fork check")
-print("=" * 60)
-# Check if there's a vllm flash_attn with the fix
-search_paths = [
-    "/workspace/.venv/lib/python3.12/site-packages/vllm",
-    "/workspace/.venv/lib/python3.12/site-packages/flash_attn",
-    "/workspace/.venv/lib/python3.12/site-packages",
-]
-for base in search_paths:
-    if os.path.isdir(base):
-        for root, dirs, files in os.walk(base):
-            for f in files:
-                if "flash" in f.lower() and f.endswith('.so'):
-                    full = os.path.join(root, f)
-                    size_mb = os.path.getsize(full) / 1024 / 1024
-                    print(f"  {full} ({size_mb:.1f} MB)")
-            # Don't recurse too deep
-            if root.count(os.sep) - base.count(os.sep) > 2:
-                dirs.clear()
-
-print("\n" + "=" * 60)
-print("6. Environment variable check")
-print("=" * 60)
-env_vars = [
-    "FLASH_ATTENTION_DISABLE_FP8_TWO_LEVEL_ACCUMULATION",
-    "FLASH_ATTENTION_FORCE_FP8_TWO_LEVEL_ACCUMULATION",
-    "VLLM_FLASH_ATTN_SRC_DIR",
-]
-for var in env_vars:
-    val = os.environ.get(var, "<not set>")
-    print(f"  {var}: {val}")
-
-print("\n" + "=" * 60)
-print("Done.")
-print("=" * 60)
diff --git a/debug_shapes.py b/debug_shapes.py
deleted file mode 100644
index 5a5edd6106c..00000000000
--- a/debug_shapes.py
+++ /dev/null
@@ -1,113 +0,0 @@
-"""Debug script: check actual Q/K/V tensor shapes during HunyuanVideo inference.
-Patches Attention.forward to log shapes on first call, then runs a short generation."""
-import torch
-
-# Monkey-patch Attention.forward to log actual tensor shapes
-_shape_log = []
-_logged_count = 0
-_max_log = 10  # only log first 10 unique shapes
-
-
-def _patch_attention():
-    from vllm_omni.diffusion.attention.layer import Attention
-    _orig_forward = Attention.forward
-
-    def _logging_forward(self, query, key, value, attn_metadata=None):
-        global _logged_count
-        if _logged_count < _max_log:
-            shape_key = (tuple(query.shape), tuple(key.shape))
-            if shape_key not in [s[0:2] for s in _shape_log]:
-                has_mask = (attn_metadata is not None and
-                            attn_metadata.attn_mask is not None)
-                mask_shape = (list(attn_metadata.attn_mask.shape)
-                              if has_mask else None)
-                mask_false = 0
-                if has_mask:
-                    mask_false = int((~attn_metadata.attn_mask).sum().item())
-
-                entry = (
-                    tuple(query.shape),
-                    tuple(key.shape),
-                    query.dtype,
-                    mask_shape,
-                    mask_false,
-                )
-                _shape_log.append(entry)
-                _logged_count += 1
-                B, S, H, D = query.shape
-                print(f"[SHAPE] q={list(query.shape)} k={list(key.shape)} "
-                      f"dtype={query.dtype} "
-                      f"tokens={S} heads={H} headdim={D} "
-                      f"mask={mask_shape} mask_false={mask_false}")
-        return _orig_forward(self, query, key, value, attn_metadata)
-
-    Attention.forward = _logging_forward
-    print("[DEBUG] Attention.forward patched for shape logging")
-
-
-_patch_attention()
-
-
-def main():
-    print("\n" + "=" * 60)
-    print("Theoretical token counts for HunyuanVideo 1.5")
-    print("=" * 60)
-    for n_frames in [33, 61, 81, 121]:
-        t = (n_frames - 1) // 4 + 1
-        h = 480 // 8
-        w = 832 // 8
-        vae_tokens = t * h * w
-        h_p = h // 2
-        w_p = w // 2
-        patch_tokens = t * h_p * w_p
-        print(f"  {n_frames}f: VAE latent {t}x{h}x{w}={vae_tokens}, "
-              f"after patch {t}x{h_p}x{w_p}={patch_tokens} tokens")
-
-    print("\n" + "=" * 60)
-    print("Running short generation to capture actual shapes...")
-    print("=" * 60)
-
-    from vllm_omni.diffusion.data import DiffusionParallelConfig
-    from vllm_omni.entrypoints.omni import Omni
-    from vllm_omni.inputs.data import OmniDiffusionSamplingParams
-    from vllm_omni.platforms import current_omni_platform
-
-    model = "hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-480p_t2v"
-    generator = torch.Generator(device=current_omni_platform.device_type).manual_seed(42)
-
-    omni = Omni(
-        model=model,
-        vae_use_tiling=True,
-        enforce_eager=True,
-        parallel_config=DiffusionParallelConfig(),
-    )
-
-    print("\n[DEBUG] Starting generation (2 steps only)...")
-    try:
-        outputs = omni.generate(
-            {"prompt": "A cat in a garden."},
-            OmniDiffusionSamplingParams(
-                height=480,
-                width=832,
-                num_frames=121,
-                generator=generator,
-                guidance_scale=6.0,
-                num_inference_steps=2,
-            ),
-        )
-        print("[DEBUG] Generation completed.")
-    except Exception as e:
-        print(f"[DEBUG] Generation error (may be expected): {e}")
-
-    print("\n" + "=" * 60)
-    print("Shape summary")
-    print("=" * 60)
-    for entry in _shape_log:
-        q_shape, k_shape, dtype, mask_shape, mask_false = entry
-        B, S, H, D = q_shape
-        print(f"  q={list(q_shape)} k={list(k_shape)} "
-              f"tokens={S} mask_false={mask_false}")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/debug_vllm_fa.py b/debug_vllm_fa.py
deleted file mode 100644
index eae70097802..00000000000
--- a/debug_vllm_fa.py
+++ /dev/null
@@ -1,157 +0,0 @@
-"""Check vllm.vllm_flash_attn for FP8 support and two-level accumulation."""
-import inspect
-import os
-
-print("=" * 60)
-print("1. vllm.vllm_flash_attn contents")
-print("=" * 60)
-try:
-    import vllm.vllm_flash_attn as vfa
-    print(f"Location: {vfa.__file__}")
-    pkg_dir = os.path.dirname(vfa.__file__)
-    for f in sorted(os.listdir(pkg_dir)):
-        full = os.path.join(pkg_dir, f)
-        if os.path.isfile(full):
-            size = os.path.getsize(full)
-            print(f"  {f} ({size/1024:.1f} KB)")
-
-    print(f"\nExported names: {dir(vfa)}")
-except Exception as e:
-    print(f"Not available: {e}")
-
-print("\n" + "=" * 60)
-print("2. Check for flash_attn_func / varlen with descale")
-print("=" * 60)
-funcs_to_check = [
-    "flash_attn_func",
-    "flash_attn_varlen_func",
-    "flash_attn_with_kvcache",
-]
-for fname in funcs_to_check:
-    try:
-        func = getattr(vfa, fname, None)
-        if func is None:
-            # Try submodule
-            try:
-                from vllm.vllm_flash_attn import flash_attn_interface
-                func = getattr(flash_attn_interface, fname, None)
-            except:
-                pass
-        if func is not None:
-            sig = inspect.signature(func)
-            params = list(sig.parameters.keys())
-            has_descale = any("descale" in p for p in params)
-            print(f"\n  {fname}:")
-            print(f"    params: {params}")
-            print(f"    has descale: {has_descale}")
-        else:
-            print(f"\n  {fname}: not found")
-    except Exception as e:
-        print(f"\n  {fname}: error - {e}")
-
-print("\n" + "=" * 60)
-print("3. Check for two-level accumulation in source")
-print("=" * 60)
-try:
-    pkg_dir = os.path.dirname(vfa.__file__)
-    keywords = [
-        "two_level", "TWO_LEVEL", "fp8_two_level",
-        "FP8_TWO_LEVEL", "accum", "flush",
-    ]
-    for f in os.listdir(pkg_dir):
-        if f.endswith('.py'):
-            filepath = os.path.join(pkg_dir, f)
-            with open(filepath, 'r') as fh:
-                content = fh.read()
-            for kw in keywords:
-                if kw.lower() in content.lower():
-                    # Find the line
-                    for i, line in enumerate(content.split('\n')):
-                        if kw.lower() in line.lower():
-                            print(f"  {f}:{i+1}: {line.strip()[:100]}")
-                            break
-except Exception as e:
-    print(f"  Error: {e}")
-
-print("\n" + "=" * 60)
-print("4. Check CUDA backend for FP8")
-print("=" * 60)
-try:
-    pkg_dir = os.path.dirname(vfa.__file__)
-    for f in os.listdir(pkg_dir):
-        if f.endswith('.so') or f.endswith('.pyd'):
-            full = os.path.join(pkg_dir, f)
-            size_mb = os.path.getsize(full) / 1024 / 1024
-            print(f"  {f} ({size_mb:.1f} MB)")
-except Exception as e:
-    print(f"  Error: {e}")
-
-print("\n" + "=" * 60)
-print("5. Quick FP8 functional test with vllm_flash_attn")
-print("=" * 60)
-try:
-    import torch
-    if not torch.cuda.is_available():
-        print("  No GPU, skipping")
-    else:
-        # Try to use vllm's flash_attn for FP8
-        func = getattr(vfa, 'flash_attn_func', None)
-        if func is None:
-            try:
-                from vllm.vllm_flash_attn.flash_attn_interface import flash_attn_func as func
-            except:
-                pass
-
-        if func is not None:
-            sig = inspect.signature(func)
-            params = list(sig.parameters.keys())
-            if any("descale" in p for p in params):
-                # Test FP8 attention
-                B, S, H, D = 1, 1024, 16, 128
-                q = torch.randn(B, S, H, D, dtype=torch.bfloat16, device="cuda")
-                k = torch.randn(B, S, H, D, dtype=torch.bfloat16, device="cuda")
-                v = torch.randn(B, S, H, D, dtype=torch.bfloat16, device="cuda")
-
-                # Quantize to FP8
-                from vllm_omni.quantization.kv_quant import quantize_qkv_fp8
-                fp8_q, fp8_k, fp8_v, qs, ks, vs = quantize_qkv_fp8(q, k, v)
-
-                qs_2d = qs.view(1,1).expand(B, H).contiguous()
-                ks_2d = ks.view(1,1).expand(B, H).contiguous()
-                vs_2d = vs.view(1,1).expand(B, H).contiguous()
-
-                # Find the right param names
-                descale_params = [p for p in params if "descale" in p]
-                print(f"  Descale param names: {descale_params}")
-
-                kwargs = {"softmax_scale": D**-0.5, "causal": False}
-                for p in descale_params:
-                    if "q" in p: kwargs[p] = qs_2d
-                    elif "k" in p: kwargs[p] = ks_2d
-                    elif "v" in p: kwargs[p] = vs_2d
-
-                out = func(fp8_q, fp8_k, fp8_v, **kwargs)
-                if isinstance(out, tuple):
-                    out = out[0]
-                print(f"  FP8 test passed! Output shape: {out.shape}, dtype: {out.dtype}")
-                print(f"  Output has NaN: {torch.isnan(out).any()}")
-                print(f"  Output has Inf: {torch.isinf(out).any()}")
-
-                # Compare with BF16
-                out_bf16 = func(q, k, v, softmax_scale=D**-0.5, causal=False)
-                if isinstance(out_bf16, tuple):
-                    out_bf16 = out_bf16[0]
-                diff = (out.float() - out_bf16.float()).abs().mean().item()
-                print(f"  Mean abs diff vs BF16: {diff:.6f}")
-            else:
-                print("  vllm flash_attn_func has no descale params")
-        else:
-            print("  No flash_attn_func found in vllm_flash_attn")
-except Exception as e:
-    print(f"  Error: {e}")
-    import traceback
-    traceback.print_exc()
-
-print("\n" + "=" * 60)
-print("Done.")
-print("=" * 60)
diff --git a/vllm_omni/diffusion/attention/backends/flash_attn.py b/vllm_omni/diffusion/attention/backends/flash_attn.py
index ea5147ae38d..60ccba99c9b 100644
--- a/vllm_omni/diffusion/attention/backends/flash_attn.py
+++ b/vllm_omni/diffusion/attention/backends/flash_attn.py
@@ -234,11 +234,10 @@ def _forward_fp8(
         value: torch.Tensor,
         attn_metadata: AttentionMetadata,
     ) -> torch.Tensor:
-        """FP8 Q/K/V attention path.
+        """Optimized FP8 Q/K/V attention path.
 
-        Uses vLLM's bundled FA3 backend (vllm_flash_attn) which has the
-        two-level accumulation fix for FP8 on Hopper. Falls back to
-        fa3-fwd or dequant if vLLM's FA3 is unavailable.
+        Uses fa3_attn_func directly (non-varlen) for minimum overhead.
+        With scale=1.0 fast quantization, descale tensors are all-ones.
         """
         q_scale = attn_metadata.q_scale
         k_scale = attn_metadata.k_scale
@@ -249,40 +248,7 @@ def _forward_fp8(
         k_descale = self._reshape_descale(k_scale, B, H)
         v_descale = self._reshape_descale(v_scale, B, H)
 
-        # Try vLLM's bundled FA3 (has two-level accumulation fix for FP8)
-        try:
-            from vllm.vllm_flash_attn import flash_attn_varlen_func as vllm_varlen
-
-            # varlen API needs (total_tokens, H, D) and cu_seqlens
-            q_flat = query.reshape(B * S, H, D)
-            k_flat = key.reshape(B * S, H, D)
-            v_flat = value.reshape(B * S, H, D)
-            cu_seqlens = torch.arange(
-                0, (B + 1) * S, step=S, dtype=torch.int32, device=query.device
-            )
-
-            out = vllm_varlen(
-                q_flat, k_flat, v_flat,
-                max_seqlen_q=S,
-                cu_seqlens_q=cu_seqlens,
-                max_seqlen_k=S,
-                cu_seqlens_k=cu_seqlens,
-                softmax_scale=self.softmax_scale,
-                causal=self.causal,
-                q_descale=q_descale,
-                k_descale=k_descale,
-                v_descale=v_descale,
-                fa_version=3,
-            )
-            if isinstance(out, tuple):
-                out = out[0]
-            return out.reshape(B, S, H, D)
-        except Exception as e:
-            logger.warning_once(
-                "vLLM FA3 FP8 failed (%s), trying fa3-fwd fallback.", e
-            )
-
-        # Fallback: fa3-fwd (may lack two-level accumulation fix)
+        # Primary path: fa3_attn_func (non-varlen, lowest overhead)
         from vllm_omni.diffusion.attention.backends.ring.ring_globals import (
             HAS_FA3,
             fa3_attn_func,
@@ -301,7 +267,7 @@ def _forward_fp8(
                 out = out[0]
             return out
 
-        # Last resort: dequant to BF16
+        # Fallback: dequant to BF16
         from vllm_omni.quantization.kv_quant import dequantize_fp8
 
         logger.warning_once(
diff --git a/vllm_omni/diffusion/attention/layer.py b/vllm_omni/diffusion/attention/layer.py
index 286dffea02c..582a54f236d 100644
--- a/vllm_omni/diffusion/attention/layer.py
+++ b/vllm_omni/diffusion/attention/layer.py
@@ -130,12 +130,12 @@ def _quantize_qkv_fp8(
         step always runs dynamic.
         """
         from vllm_omni.quantization.kv_quant import (
-            quantize_kv_fp8,
-            quantize_qkv_fp8,
+            quantize_kv_fp8_fast,
+            quantize_qkv_fp8_fast,
         )
 
-        fp8_q, fp8_k, fp8_v, q_scale, k_scale, v_scale = quantize_qkv_fp8(
-            query, key, value, cached_scales=None
+        fp8_q, fp8_k, fp8_v, q_scale, k_scale, v_scale = quantize_qkv_fp8_fast(
+            query, key, value
         )
 
         if attn_metadata is None:
@@ -146,9 +146,8 @@ def _quantize_qkv_fp8(
 
         # Quantize joint_key/joint_value with separate scales
         if attn_metadata.joint_key is not None and attn_metadata.joint_value is not None:
-            jk, jv, jk_scale, jv_scale = quantize_kv_fp8(
+            jk, jv, jk_scale, jv_scale = quantize_kv_fp8_fast(
                 attn_metadata.joint_key, attn_metadata.joint_value,
-                cached_scales=None,
             )
             attn_metadata.joint_key = jk
             attn_metadata.joint_value = jv
diff --git a/vllm_omni/quantization/kv_quant.py b/vllm_omni/quantization/kv_quant.py
index 94992c1f11d..54240c1db38 100644
--- a/vllm_omni/quantization/kv_quant.py
+++ b/vllm_omni/quantization/kv_quant.py
@@ -144,3 +144,36 @@ def dequantize_fp8(
         Dequantized tensor: ``tensor.to(output_dtype) * inv_scale``.
     """
     return (tensor.to(output_dtype) * inv_scale).to(output_dtype)
+
+
+def quantize_qkv_fp8_fast(
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor,
+           torch.Tensor, torch.Tensor, torch.Tensor]:
+    """Ultra-fast FP8 quantization using direct saturating cast (no amax).
+
+    For diffusion attention where Q/K/V values are typically in [-10, 10],
+    well within float8_e4m3fn range (±448). Eliminates the expensive
+    per-tensor amax reduction that dominates quantization overhead at
+    large sequence lengths (50K+ tokens).
+
+    Scale is fixed at 1.0 (identity), so descale is also 1.0.
+    """
+    one = torch.ones(1, dtype=torch.float32, device=query.device)
+    fp8_q = query.to(torch.float8_e4m3fn)
+    fp8_k = key.to(torch.float8_e4m3fn)
+    fp8_v = value.to(torch.float8_e4m3fn)
+    return fp8_q, fp8_k, fp8_v, one, one, one
+
+
+def quantize_kv_fp8_fast(
+    key: torch.Tensor,
+    value: torch.Tensor,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    """Fast FP8 quantization for K/V only (joint attention path)."""
+    one = torch.ones(1, dtype=torch.float32, device=key.device)
+    fp8_k = key.to(torch.float8_e4m3fn)
+    fp8_v = value.to(torch.float8_e4m3fn)
+    return fp8_k, fp8_v, one, one

From 529b7fb3a66fce41404a1ca92ada509603d8cb0b Mon Sep 17 00:00:00 2001
From: lishunyang <lishunyang12@163.com>
Date: Wed, 8 Apr 2026 00:05:22 +0800
Subject: [PATCH 30/45] Move FP8 quantization into attention backends with
 per-platform framework

Signed-off-by: lishunyang <lishunyang12@163.com>
---
 tests/diffusion/quantization/test_kv_quant.py |  30 +++--
 .../diffusion/attention/backends/abstract.py  |  62 ++++++++--
 .../attention/backends/flash_attn.py          |  71 +++++++----
 .../diffusion/attention/backends/sdpa.py      |  17 +--
 vllm_omni/diffusion/attention/layer.py        | 116 +++++-------------
 vllm_omni/diffusion/data.py                   |  12 +-
 vllm_omni/quantization/kv_quant.py            |   6 +
 7 files changed, 165 insertions(+), 149 deletions(-)

diff --git a/tests/diffusion/quantization/test_kv_quant.py b/tests/diffusion/quantization/test_kv_quant.py
index e9c7eebe861..a3276cc5319 100644
--- a/tests/diffusion/quantization/test_kv_quant.py
+++ b/tests/diffusion/quantization/test_kv_quant.py
@@ -121,19 +121,23 @@ def test_kv_cache_dtype_config_field():
     assert config_default.kv_cache_dtype is None
 
 
-def test_attention_metadata_scales():
-    """AttentionMetadata should have q/k/v and joint scale fields."""
+def test_is_quantized_kv_cache():
+    """is_quantized_kv_cache should detect FP8 dtype strings."""
+    from vllm_omni.quantization.kv_quant import is_quantized_kv_cache
+
+    assert is_quantized_kv_cache("fp8") is True
+    assert is_quantized_kv_cache("fp8_e4m3") is True
+    assert is_quantized_kv_cache(None) is False
+    assert is_quantized_kv_cache("auto") is False
+    assert is_quantized_kv_cache("bfloat16") is False
+
+
+def test_attention_metadata_kv_cache_dtype():
+    """AttentionMetadata should have kv_cache_dtype field."""
     from vllm_omni.diffusion.attention.backends.abstract import AttentionMetadata
 
     meta = AttentionMetadata()
-    assert meta.q_scale is None
-    assert meta.k_scale is None
-    assert meta.v_scale is None
-    assert meta.jk_scale is None
-    assert meta.jv_scale is None
-
-    scale = torch.tensor(0.5)
-    meta.q_scale = scale
-    meta.k_scale = scale
-    meta.v_scale = scale
-    assert meta.q_scale is scale
+    assert meta.kv_cache_dtype is None
+
+    meta.kv_cache_dtype = "fp8"
+    assert meta.kv_cache_dtype == "fp8"
diff --git a/vllm_omni/diffusion/attention/backends/abstract.py b/vllm_omni/diffusion/attention/backends/abstract.py
index 27476e8c43c..4a02c52b881 100644
--- a/vllm_omni/diffusion/attention/backends/abstract.py
+++ b/vllm_omni/diffusion/attention/backends/abstract.py
@@ -6,9 +6,12 @@
 from typing import Generic, TypeVar
 
 import torch
+from vllm.logger import init_logger
 
 from vllm_omni.platforms import current_omni_platform
 
+logger = init_logger(__name__)
+
 
 class AttentionBackend(ABC):
     """Abstract class for diffusion attention backends."""
@@ -19,6 +22,15 @@ class AttentionBackend(ABC):
     def supports_attention_mask(cls) -> bool:
         return False
 
+    @classmethod
+    def supports_kv_cache_dtype(cls, kv_cache_dtype: str | None) -> bool:
+        """Whether this backend supports the given KV cache quantization dtype.
+
+        Override in subclasses that support quantized KV (e.g. FP8).
+        Default: only None (no quantization) is supported.
+        """
+        return kv_cache_dtype is None
+
     @staticmethod
     @abstractmethod
     def get_name() -> str:
@@ -65,19 +77,21 @@ class AttentionMetadata:
     joint_strategy: str = "front"
     # the strategy to joint the query, key, and value, can be "front" or "rear"
 
-    # FP8 attention quantization dequant scales (set by Attention._quantize_qkv_fp8)
-    q_scale: torch.Tensor | None = None
-    k_scale: torch.Tensor | None = None
-    v_scale: torch.Tensor | None = None
-    # Separate scales for joint (img+txt concat) key/value
-    jk_scale: torch.Tensor | None = None
-    jv_scale: torch.Tensor | None = None
+    # KV cache dtype for quantization (e.g. "fp8"). Each backend decides
+    # whether and how to quantize Q/K/V based on this field.
+    kv_cache_dtype: str | None = None
 
 
 T = TypeVar("T", bound=AttentionMetadata)
 
 
 class AttentionImpl(ABC, Generic[T]):
+
+    # Per-platform kv_cache_dtype support. Maps platform name to set of
+    # supported dtype strings. Subclasses override to declare support.
+    # Example: {"CUDA": {"fp8", "fp8_e4m3"}, "NPU": {"fp8"}}
+    _supported_kv_cache_dtypes: dict[str, set[str]] = {}
+
     @abstractmethod
     def __init__(
         self,
@@ -91,6 +105,35 @@ def __init__(
     ) -> None:
         raise NotImplementedError
 
+    def _handle_kv_cache_dtype(
+        self,
+        attn_metadata: T | None,
+        platform: str,
+    ) -> None:
+        """Check kv_cache_dtype compatibility for this platform.
+
+        If the requested kv_cache_dtype is not in _supported_kv_cache_dtypes
+        for the current platform, it is cleared to None with a warning.
+
+        To add FP8 support for a new platform, add the platform key:
+            _supported_kv_cache_dtypes = {"CUDA": {"fp8"}, "NPU": {"fp8"}}
+        """
+        if attn_metadata is None:
+            return
+        kv_cache_dtype = attn_metadata.kv_cache_dtype
+        if kv_cache_dtype is None:
+            return
+        supported = self._supported_kv_cache_dtypes.get(platform, set())
+        if kv_cache_dtype not in supported:
+            logger.warning_once(
+                "kv_cache_dtype='%s' requested but %s on %s does not support "
+                "it. Running in native dtype.",
+                kv_cache_dtype,
+                type(self).__name__,
+                platform,
+            )
+            attn_metadata.kv_cache_dtype = None
+
     def forward(
         self,
         query: torch.Tensor,
@@ -100,14 +143,19 @@ def forward(
     ) -> torch.Tensor:
         """Dispatch to platform-specific forward implementation."""
         if current_omni_platform.is_rocm():
+            self._handle_kv_cache_dtype(attn_metadata, "HIP")
             return self.forward_hip(query, key, value, attn_metadata)
         elif current_omni_platform.is_cuda():
+            self._handle_kv_cache_dtype(attn_metadata, "CUDA")
             return self.forward_cuda(query, key, value, attn_metadata)
         elif current_omni_platform.is_npu():
+            self._handle_kv_cache_dtype(attn_metadata, "NPU")
             return self.forward_npu(query, key, value, attn_metadata)
         elif current_omni_platform.is_xpu():
+            self._handle_kv_cache_dtype(attn_metadata, "XPU")
             return self.forward_xpu(query, key, value, attn_metadata)
         elif current_omni_platform.is_musa():
+            self._handle_kv_cache_dtype(attn_metadata, "MUSA")
             return self.forward_musa(query, key, value, attn_metadata)
         else:
             raise NotImplementedError(f"No forward implementation for platform: {current_omni_platform}")
diff --git a/vllm_omni/diffusion/attention/backends/flash_attn.py b/vllm_omni/diffusion/attention/backends/flash_attn.py
index 60ccba99c9b..6d9d8a13cad 100644
--- a/vllm_omni/diffusion/attention/backends/flash_attn.py
+++ b/vllm_omni/diffusion/attention/backends/flash_attn.py
@@ -20,6 +20,14 @@ class FlashAttentionBackend(AttentionBackend):
     def supports_attention_mask(cls) -> bool:
         return True
 
+    @classmethod
+    def supports_kv_cache_dtype(cls, kv_cache_dtype: str | None) -> bool:
+        if kv_cache_dtype is None:
+            return True
+        from vllm_omni.quantization.kv_quant import is_quantized_kv_cache
+
+        return is_quantized_kv_cache(kv_cache_dtype)
+
     @staticmethod
     def get_supported_head_sizes() -> list[int]:
         return [64, 96, 128, 192, 256]
@@ -34,6 +42,14 @@ def get_impl_cls() -> type["FlashAttentionImpl"]:
 
 
 class FlashAttentionImpl(AttentionImpl):
+    # FP8 KV quantization: currently supported on CUDA and HIP.
+    # NPU/XPU contributors: add your platform here when implementing
+    # FP8 support in forward_npu()/forward_xpu().
+    _supported_kv_cache_dtypes = {
+        "CUDA": {"fp8", "fp8_e4m3"},
+        "HIP": {"fp8", "fp8_e4m3"},
+    }
+
     def __init__(
         self,
         num_heads: int,
@@ -106,8 +122,10 @@ def forward_cuda(
         attn_metadata: AttentionMetadata = None,
     ) -> torch.Tensor:
         """CUDA/ROCm flash attention implementation."""
-        # Dispatch to FP8 path if Q/K/V are quantized
-        if key.dtype == torch.float8_e4m3fn:
+        from vllm_omni.quantization.kv_quant import is_quantized_kv_cache
+
+        kv_cache_dtype = attn_metadata.kv_cache_dtype if attn_metadata else None
+        if is_quantized_kv_cache(kv_cache_dtype):
             return self._forward_fp8(query, key, value, attn_metadata)
         from vllm_omni.diffusion.attention.backends.utils.fa import (
             HAS_FLASH_ATTN,
@@ -234,21 +252,36 @@ def _forward_fp8(
         value: torch.Tensor,
         attn_metadata: AttentionMetadata,
     ) -> torch.Tensor:
-        """Optimized FP8 Q/K/V attention path.
+        """FP8 attention: quantize Q/K/V here, then use FA3 native or BF16 fallback.
 
-        Uses fa3_attn_func directly (non-varlen) for minimum overhead.
-        With scale=1.0 fast quantization, descale tensors are all-ones.
+        Quantization is owned by the backend so that:
+        1. Non-FP8 backends (SDPA) never pay the quant/dequant cost.
+        2. Each platform can plug in its own FP8 conversion logic.
         """
-        q_scale = attn_metadata.q_scale
-        k_scale = attn_metadata.k_scale
-        v_scale = attn_metadata.v_scale
+        from vllm_omni.quantization.kv_quant import (
+            quantize_kv_fp8_fast,
+            quantize_qkv_fp8_fast,
+        )
+
+        # Quantize Q/K/V using fast saturating cast (scale=1.0)
+        fp8_q, fp8_k, fp8_v, q_scale, k_scale, v_scale = quantize_qkv_fp8_fast(
+            query, key, value
+        )
+
+        # Also quantize joint K/V if present
+        if attn_metadata.joint_key is not None and attn_metadata.joint_value is not None:
+            jk, jv, _, _ = quantize_kv_fp8_fast(
+                attn_metadata.joint_key, attn_metadata.joint_value
+            )
+            attn_metadata.joint_key = jk
+            attn_metadata.joint_value = jv
 
         B, S, H, D = key.shape
         q_descale = self._reshape_descale(q_scale, B, H)
         k_descale = self._reshape_descale(k_scale, B, H)
         v_descale = self._reshape_descale(v_scale, B, H)
 
-        # Primary path: fa3_attn_func (non-varlen, lowest overhead)
+        # Primary path: FA3 native FP8
         from vllm_omni.diffusion.attention.backends.ring.ring_globals import (
             HAS_FA3,
             fa3_attn_func,
@@ -256,28 +289,18 @@ def _forward_fp8(
 
         if HAS_FA3 and fa3_attn_func is not None:
             out = fa3_attn_func(
-                query, key, value,
+                fp8_q, fp8_k, fp8_v,
                 softmax_scale=self.softmax_scale,
                 causal=self.causal,
                 q_descale=q_descale,
                 k_descale=k_descale,
                 v_descale=v_descale,
             )
-            if isinstance(out, tuple):
-                out = out[0]
-            return out
-
-        # Fallback: dequant to BF16
-        from vllm_omni.quantization.kv_quant import dequantize_fp8
+            return self._unwrap_flash_output(out)
 
+        # Fallback: no FA3, run standard BF16 path
         logger.warning_once(
-            "No FA3 available for FP8 attention. Dequantizing to BF16."
+            "No FA3 available for FP8 attention. Running in BF16."
         )
-        output_dtype = torch.bfloat16
-        query = dequantize_fp8(query, q_scale, output_dtype)
-        key = dequantize_fp8(key, k_scale, output_dtype)
-        value = dequantize_fp8(value, v_scale, output_dtype)
-        attn_metadata.q_scale = None
-        attn_metadata.k_scale = None
-        attn_metadata.v_scale = None
+        attn_metadata.kv_cache_dtype = None
         return self.forward_cuda(query, key, value, attn_metadata)
diff --git a/vllm_omni/diffusion/attention/backends/sdpa.py b/vllm_omni/diffusion/attention/backends/sdpa.py
index f400f57dba8..7222306d71d 100644
--- a/vllm_omni/diffusion/attention/backends/sdpa.py
+++ b/vllm_omni/diffusion/attention/backends/sdpa.py
@@ -97,21 +97,8 @@ def _forward_impl(
         attn_metadata: AttentionMetadata | None = None,
         mask_mode: SDPAMaskMode = "broadcast_k",
     ) -> torch.Tensor:
-        # FP8 dequantization: SDPA does not support FP8 natively
-        if key.dtype == torch.float8_e4m3fn:
-            from vllm_omni.quantization.kv_quant import dequantize_fp8
-
-            output_dtype = torch.bfloat16
-            q_scale = attn_metadata.q_scale if attn_metadata else None
-            k_scale = attn_metadata.k_scale if attn_metadata else None
-            v_scale = attn_metadata.v_scale if attn_metadata else None
-            query = dequantize_fp8(query, q_scale, output_dtype)
-            key = dequantize_fp8(key, k_scale, output_dtype)
-            value = dequantize_fp8(value, v_scale, output_dtype)
-            logger.warning_once(
-                "FP8 attention with SDPA backend: dequantizing to compute dtype. "
-                "No compute benefit. Use FA3 for optimal FP8 support."
-            )
+        # Note: unsupported kv_cache_dtype is already warned and cleared
+        # by AttentionImpl._handle_kv_cache_dtype() in the base forward().
 
         # Normalize mask before permuting q/k/v.
         # _maybe_reshape_attn_mask expects sequence length on dim=1.
diff --git a/vllm_omni/diffusion/attention/layer.py b/vllm_omni/diffusion/attention/layer.py
index 582a54f236d..9bf2cd84d72 100644
--- a/vllm_omni/diffusion/attention/layer.py
+++ b/vllm_omni/diffusion/attention/layer.py
@@ -93,12 +93,10 @@ def __init__(
         # Fallback strategy when SP is not active (outside sharded regions)
         self._no_parallel_strategy = NoParallelAttention()
 
-        # FP8 attention quantization: resolved lazily in forward() because
+        # KV cache quantization: resolved lazily in forward() because
         # forward_context is not available during model loading.
-        self._fp8_attn_enabled: bool | None = None
-        # Cached scales for delayed scaling (reuse previous timestep's scales)
-        self._cached_qkv_scales: tuple | None = None
-        self._cached_jkv_scales: tuple | None = None
+        self._kv_cache_dtype: str | None = None
+        self._kv_cache_dtype_resolved: bool = False
 
     def _get_active_parallel_strategy(self):
         """Get the parallel strategy based on current SP active state.
@@ -115,71 +113,33 @@ def _get_active_parallel_strategy(self):
                 return self._no_parallel_strategy
         return self.parallel_strategy
 
-    def _quantize_qkv_fp8(
-        self,
-        query: torch.Tensor,
-        key: torch.Tensor,
-        value: torch.Tensor,
-        attn_metadata: AttentionMetadata | None,
-    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, AttentionMetadata | None]:
-        """Quantize Q/K/V tensors to FP8 and store scales in attn_metadata.
-
-        Uses delayed scaling: first call computes dynamic scales (amax),
-        subsequent calls reuse the cached scales (static, no amax).
-        Scales are refreshed each timestep since the first layer in each
-        step always runs dynamic.
-        """
-        from vllm_omni.quantization.kv_quant import (
-            quantize_kv_fp8_fast,
-            quantize_qkv_fp8_fast,
-        )
-
-        fp8_q, fp8_k, fp8_v, q_scale, k_scale, v_scale = quantize_qkv_fp8_fast(
-            query, key, value
-        )
-
-        if attn_metadata is None:
-            attn_metadata = AttentionMetadata()
-        attn_metadata.q_scale = q_scale
-        attn_metadata.k_scale = k_scale
-        attn_metadata.v_scale = v_scale
-
-        # Quantize joint_key/joint_value with separate scales
-        if attn_metadata.joint_key is not None and attn_metadata.joint_value is not None:
-            jk, jv, jk_scale, jv_scale = quantize_kv_fp8_fast(
-                attn_metadata.joint_key, attn_metadata.joint_value,
-            )
-            attn_metadata.joint_key = jk
-            attn_metadata.joint_value = jv
-            attn_metadata.jk_scale = jk_scale
-            attn_metadata.jv_scale = jv_scale
-            self._cached_jkv_scales = (jk_scale, jv_scale)
-
-        return fp8_q, fp8_k, fp8_v, attn_metadata
-
-    def _resolve_fp8_attn(self) -> bool:
-        """Lazily resolve FP8 attention config from forward context."""
-        if self._fp8_attn_enabled is not None:
-            return self._fp8_attn_enabled
+    def _resolve_kv_cache_dtype(self) -> str | None:
+        """Lazily resolve kv_cache_dtype from forward context."""
+        if self._kv_cache_dtype_resolved:
+            return self._kv_cache_dtype
         try:
             config = get_forward_context().omni_diffusion_config
-            enabled = config.kv_cache_dtype == "fp8"
-            logger.info(
-                "FP8 attention resolved: kv_cache_dtype=%s, enabled=%s",
-                getattr(config, "kv_cache_dtype", "MISSING"),
-                enabled,
-            )
-        except Exception as e:
-            logger.warning("FP8 attention resolve failed: %s", e)
-            enabled = False
-        if enabled and self.use_ring:
-            raise ValueError(
-                "FP8 attention quantization is not compatible with ring attention "
-                "(ring_degree > 1). Ring kernels do not propagate FP8 descale "
-                "factors. Use Ulysses SP instead."
-            )
-        self._fp8_attn_enabled = enabled
-        return enabled
+            dtype = config.kv_cache_dtype
+        except Exception:
+            dtype = None
+        if dtype:
+            if not self.attn_backend.supports_kv_cache_dtype(dtype):
+                logger.warning(
+                    "Attention backend %s does not support kv_cache_dtype='%s'. "
+                    "KV quantization will be disabled.",
+                    self.attn_backend.get_name(),
+                    dtype,
+                )
+                dtype = None
+            elif self.use_ring:
+                raise ValueError(
+                    "FP8 KV quantization is not compatible with ring attention "
+                    "(ring_degree > 1). Ring kernels do not propagate FP8 descale "
+                    "factors. Use Ulysses SP instead."
+                )
+        self._kv_cache_dtype = dtype
+        self._kv_cache_dtype_resolved = True
+        return dtype
 
     def forward(
         self,
@@ -196,20 +156,12 @@ def forward(
         # For Ring: Concat joint_q
         query, key, value, attn_metadata, ctx = strategy.pre_attention(query, key, value, attn_metadata)
 
-        # 1.5 FP8 Q/K/V quantization (after AllToAll stays BF16, before kernel)
-        if self._resolve_fp8_attn():
-            # Zero out padding positions before quantizing — FP8 path skips
-            # varlen to avoid FA3 varlen+descale bug, so padding must be
-            # handled by zeroing K (makes softmax weight ≈ 0 for those positions).
-            if attn_metadata is not None and attn_metadata.attn_mask is not None:
-                mask = attn_metadata.attn_mask  # (B, S) bool
-                if not torch.all(mask):
-                    m = mask.unsqueeze(-1).unsqueeze(-1)
-                    key = key * m
-                    value = value * m
-            query, key, value, attn_metadata = self._quantize_qkv_fp8(
-                query, key, value, attn_metadata
-            )
+        # Signal KV quantization to backends via metadata
+        kv_cache_dtype = self._resolve_kv_cache_dtype()
+        if kv_cache_dtype:
+            if attn_metadata is None:
+                attn_metadata = AttentionMetadata()
+            attn_metadata.kv_cache_dtype = kv_cache_dtype
 
         # 2. Kernel Execution (Computation)
         if self.use_ring and strategy is not self._no_parallel_strategy:
diff --git a/vllm_omni/diffusion/data.py b/vllm_omni/diffusion/data.py
index a100d6d1584..a6290461649 100644
--- a/vllm_omni/diffusion/data.py
+++ b/vllm_omni/diffusion/data.py
@@ -498,15 +498,11 @@ class OmniDiffusionConfig:
     # Per-component: {"transformer": {"method": "fp8"}, "vae": None}
     quantization_config: str | QuantizationConfig | dict[str, Any] | None = None
 
-    # FP8 KV quantization: dynamically quantize attention K/V tensors to
-    # float8_e4m3fn each forward pass. Orthogonal to weight quantization.
+    # KV cache dtype for attention. Aligned with upstream vLLM's --kv-cache-dtype.
+    # None = native dtype (no quantization).
+    # "fp8" = dynamic FP8 (float8_e4m3fn) quantization per forward pass.
     # On Hopper+FA3: native FP8 attention (memory + compute savings).
-    # On FA2/SDPA: dequant fallback (memory-only savings).
-    kv_quantization: bool = False
-
-    # FP8 attention quantization (orthogonal to weight quantization).
-    # "fp8": dynamically quantize Q/K/V to float8_e4m3fn each forward pass.
-    # None or "auto": disabled.
+    # On other backends: no benefit, backends skip quantization.
     kv_cache_dtype: str | None = None
 
     # Diffusion pipeline Profiling config
diff --git a/vllm_omni/quantization/kv_quant.py b/vllm_omni/quantization/kv_quant.py
index 54240c1db38..01a9d2068b0 100644
--- a/vllm_omni/quantization/kv_quant.py
+++ b/vllm_omni/quantization/kv_quant.py
@@ -17,6 +17,12 @@
 
 logger = init_logger(__name__)
 
+
+def is_quantized_kv_cache(kv_cache_dtype: str | None) -> bool:
+    """Check if the KV cache dtype implies quantized storage."""
+    return kv_cache_dtype in ("fp8", "fp8_e4m3")
+
+
 # Try to use vLLM's fused CUDA kernel; fall back to PyTorch ops.
 try:
     from vllm._custom_ops import scaled_fp8_quant as _vllm_scaled_fp8_quant

From a275bea55657e295e482b4373cd10a4ecef7e8c0 Mon Sep 17 00:00:00 2001
From: lishunyang <lishunyang12@163.com>
Date: Wed, 8 Apr 2026 00:11:09 +0800
Subject: [PATCH 31/45] Use table-driven platform dispatch for attention
 forward

Signed-off-by: lishunyang <lishunyang12@163.com>
---
 .../diffusion/attention/backends/abstract.py  | 56 ++++++++++---------
 .../attention/backends/flash_attn.py          |  6 +-
 2 files changed, 33 insertions(+), 29 deletions(-)

diff --git a/vllm_omni/diffusion/attention/backends/abstract.py b/vllm_omni/diffusion/attention/backends/abstract.py
index 4a02c52b881..67ce819e60a 100644
--- a/vllm_omni/diffusion/attention/backends/abstract.py
+++ b/vllm_omni/diffusion/attention/backends/abstract.py
@@ -87,9 +87,13 @@ class AttentionMetadata:
 
 class AttentionImpl(ABC, Generic[T]):
 
-    # Per-platform kv_cache_dtype support. Maps platform name to set of
-    # supported dtype strings. Subclasses override to declare support.
-    # Example: {"CUDA": {"fp8", "fp8_e4m3"}, "NPU": {"fp8"}}
+    # Per-platform kv_cache_dtype support. Maps OmniPlatformEnum value
+    # (e.g. "cuda", "npu") to the set of quantized dtypes that platform
+    # handles. The base forward() checks this before dispatching and
+    # clears unsupported dtypes with a warning.
+    #
+    # To add FP8 support for a new platform in a subclass:
+    #   _supported_kv_cache_dtypes = {"cuda": {"fp8"}, "npu": {"fp8"}}
     _supported_kv_cache_dtypes: dict[str, set[str]] = {}
 
     @abstractmethod
@@ -105,32 +109,39 @@ def __init__(
     ) -> None:
         raise NotImplementedError
 
+    # Platform enum value → forward method name. New platforms only need
+    # to implement forward_{name}() and add an entry here.
+    _PLATFORM_DISPATCH: dict[str, str] = {
+        "cuda": "forward_cuda",
+        "rocm": "forward_hip",
+        "npu": "forward_npu",
+        "xpu": "forward_xpu",
+        "musa": "forward_musa",
+    }
+
     def _handle_kv_cache_dtype(
         self,
         attn_metadata: T | None,
-        platform: str,
+        platform_key: str,
     ) -> None:
         """Check kv_cache_dtype compatibility for this platform.
 
         If the requested kv_cache_dtype is not in _supported_kv_cache_dtypes
         for the current platform, it is cleared to None with a warning.
-
-        To add FP8 support for a new platform, add the platform key:
-            _supported_kv_cache_dtypes = {"CUDA": {"fp8"}, "NPU": {"fp8"}}
         """
         if attn_metadata is None:
             return
         kv_cache_dtype = attn_metadata.kv_cache_dtype
         if kv_cache_dtype is None:
             return
-        supported = self._supported_kv_cache_dtypes.get(platform, set())
+        supported = self._supported_kv_cache_dtypes.get(platform_key, set())
         if kv_cache_dtype not in supported:
             logger.warning_once(
                 "kv_cache_dtype='%s' requested but %s on %s does not support "
                 "it. Running in native dtype.",
                 kv_cache_dtype,
                 type(self).__name__,
-                platform,
+                platform_key,
             )
             attn_metadata.kv_cache_dtype = None
 
@@ -142,23 +153,16 @@ def forward(
         attn_metadata: T | None = None,
     ) -> torch.Tensor:
         """Dispatch to platform-specific forward implementation."""
-        if current_omni_platform.is_rocm():
-            self._handle_kv_cache_dtype(attn_metadata, "HIP")
-            return self.forward_hip(query, key, value, attn_metadata)
-        elif current_omni_platform.is_cuda():
-            self._handle_kv_cache_dtype(attn_metadata, "CUDA")
-            return self.forward_cuda(query, key, value, attn_metadata)
-        elif current_omni_platform.is_npu():
-            self._handle_kv_cache_dtype(attn_metadata, "NPU")
-            return self.forward_npu(query, key, value, attn_metadata)
-        elif current_omni_platform.is_xpu():
-            self._handle_kv_cache_dtype(attn_metadata, "XPU")
-            return self.forward_xpu(query, key, value, attn_metadata)
-        elif current_omni_platform.is_musa():
-            self._handle_kv_cache_dtype(attn_metadata, "MUSA")
-            return self.forward_musa(query, key, value, attn_metadata)
-        else:
-            raise NotImplementedError(f"No forward implementation for platform: {current_omni_platform}")
+        platform_key = current_omni_platform._omni_enum.value
+        method_name = self._PLATFORM_DISPATCH.get(platform_key)
+        if method_name is None:
+            raise NotImplementedError(
+                f"No forward implementation for platform: {platform_key}. "
+                f"Register it in AttentionImpl._PLATFORM_DISPATCH."
+            )
+        self._handle_kv_cache_dtype(attn_metadata, platform_key)
+        method = getattr(self, method_name)
+        return method(query, key, value, attn_metadata)
 
     def forward_cuda(
         self,
diff --git a/vllm_omni/diffusion/attention/backends/flash_attn.py b/vllm_omni/diffusion/attention/backends/flash_attn.py
index 6d9d8a13cad..77cfca52bed 100644
--- a/vllm_omni/diffusion/attention/backends/flash_attn.py
+++ b/vllm_omni/diffusion/attention/backends/flash_attn.py
@@ -42,12 +42,12 @@ def get_impl_cls() -> type["FlashAttentionImpl"]:
 
 
 class FlashAttentionImpl(AttentionImpl):
-    # FP8 KV quantization: currently supported on CUDA and HIP.
+    # FP8 KV quantization: currently supported on CUDA and ROCm.
     # NPU/XPU contributors: add your platform here when implementing
     # FP8 support in forward_npu()/forward_xpu().
     _supported_kv_cache_dtypes = {
-        "CUDA": {"fp8", "fp8_e4m3"},
-        "HIP": {"fp8", "fp8_e4m3"},
+        "cuda": {"fp8", "fp8_e4m3"},
+        "rocm": {"fp8", "fp8_e4m3"},
     }
 
     def __init__(

From fc095e076dfa945e933059944d1fdcb59a68372b Mon Sep 17 00:00:00 2001
From: lishunyang <lishunyang12@163.com>
Date: Wed, 8 Apr 2026 00:14:10 +0800
Subject: [PATCH 32/45] Scope FP8 KV support to CUDA only, comment placeholders
 for other platforms

Signed-off-by: lishunyang <lishunyang12@163.com>
---
 vllm_omni/diffusion/attention/backends/flash_attn.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/vllm_omni/diffusion/attention/backends/flash_attn.py b/vllm_omni/diffusion/attention/backends/flash_attn.py
index 77cfca52bed..8495862b85c 100644
--- a/vllm_omni/diffusion/attention/backends/flash_attn.py
+++ b/vllm_omni/diffusion/attention/backends/flash_attn.py
@@ -42,12 +42,13 @@ def get_impl_cls() -> type["FlashAttentionImpl"]:
 
 
 class FlashAttentionImpl(AttentionImpl):
-    # FP8 KV quantization: currently supported on CUDA and ROCm.
-    # NPU/XPU contributors: add your platform here when implementing
-    # FP8 support in forward_npu()/forward_xpu().
+    # Per-platform FP8 KV quantization support.
+    # To enable FP8 on a new platform, add its OmniPlatformEnum value here
+    # and handle kv_cache_dtype in the corresponding forward_{platform}().
     _supported_kv_cache_dtypes = {
         "cuda": {"fp8", "fp8_e4m3"},
-        "rocm": {"fp8", "fp8_e4m3"},
+        # "rocm": {"fp8", "fp8_e4m3"},
+        # "npu": {"fp8"},
     }
 
     def __init__(

From 88bd707d27de61eb6388c856ca95129bda31de5b Mon Sep 17 00:00:00 2001
From: lishunyang <lishunyang12@163.com>
Date: Wed, 8 Apr 2026 00:27:05 +0800
Subject: [PATCH 33/45] Keep dispatch table complete, silence CUDA kernel
 warning for non-CUDA platforms

Signed-off-by: lishunyang <lishunyang12@163.com>
---
 vllm_omni/diffusion/attention/backends/abstract.py | 4 ++--
 vllm_omni/quantization/kv_quant.py                 | 7 ++-----
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/vllm_omni/diffusion/attention/backends/abstract.py b/vllm_omni/diffusion/attention/backends/abstract.py
index 67ce819e60a..e8cb8abd65e 100644
--- a/vllm_omni/diffusion/attention/backends/abstract.py
+++ b/vllm_omni/diffusion/attention/backends/abstract.py
@@ -109,8 +109,8 @@ def __init__(
     ) -> None:
         raise NotImplementedError
 
-    # Platform enum value → forward method name. New platforms only need
-    # to implement forward_{name}() and add an entry here.
+    # Platform enum value → forward method name. To add a new platform,
+    # implement forward_{name}() and register it here.
     _PLATFORM_DISPATCH: dict[str, str] = {
         "cuda": "forward_cuda",
         "rocm": "forward_hip",
diff --git a/vllm_omni/quantization/kv_quant.py b/vllm_omni/quantization/kv_quant.py
index 01a9d2068b0..fbc90a3e9dd 100644
--- a/vllm_omni/quantization/kv_quant.py
+++ b/vllm_omni/quantization/kv_quant.py
@@ -23,17 +23,14 @@ def is_quantized_kv_cache(kv_cache_dtype: str | None) -> bool:
     return kv_cache_dtype in ("fp8", "fp8_e4m3")
 
 
-# Try to use vLLM's fused CUDA kernel; fall back to PyTorch ops.
+# Try to use vLLM's fused CUDA kernel for quantization.
+# Falls back to device-agnostic PyTorch ops (works on any platform).
 try:
     from vllm._custom_ops import scaled_fp8_quant as _vllm_scaled_fp8_quant
 
     _HAS_FUSED_QUANT = True
 except ImportError:
     _HAS_FUSED_QUANT = False
-    logger.warning_once(
-        "vLLM scaled_fp8_quant not available, using PyTorch ops fallback. "
-        "FP8 attention will work but with higher quantization overhead."
-    )
 
 
 def _quantize_tensor_fp8(

From a9a5037c5f91c78a2fc8fe10aea1683cf8077042 Mon Sep 17 00:00:00 2001
From: lishunyang <lishunyang12@163.com>
Date: Wed, 8 Apr 2026 00:44:00 +0800
Subject: [PATCH 34/45] Fix: skip CUDA fused quant kernel on non-CUDA tensors

Signed-off-by: lishunyang <lishunyang12@163.com>
---
 vllm_omni/quantization/kv_quant.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm_omni/quantization/kv_quant.py b/vllm_omni/quantization/kv_quant.py
index fbc90a3e9dd..73b83cd74a7 100644
--- a/vllm_omni/quantization/kv_quant.py
+++ b/vllm_omni/quantization/kv_quant.py
@@ -47,7 +47,7 @@ def _quantize_tensor_fp8(
     Returns:
         ``(fp8_tensor, inv_scale)`` where inv_scale is the dequant scale.
     """
-    if _HAS_FUSED_QUANT:
+    if _HAS_FUSED_QUANT and tensor.is_cuda:
         orig_shape = tensor.shape
         flat = tensor.reshape(-1, orig_shape[-1])
         # Pass cached_scale for static quant (no amax), None for dynamic

From 68bc96ef80c5d416c99998ac142e03c0b55d3d22 Mon Sep 17 00:00:00 2001
From: lishunyang <lishunyang12@163.com>
Date: Wed, 8 Apr 2026 00:49:39 +0800
Subject: [PATCH 35/45] Add tests for fast quant, backend support, and
 per-platform dtype guard

Signed-off-by: lishunyang <lishunyang12@163.com>
---
 tests/diffusion/quantization/test_kv_quant.py | 90 +++++++++++++++++++
 1 file changed, 90 insertions(+)

diff --git a/tests/diffusion/quantization/test_kv_quant.py b/tests/diffusion/quantization/test_kv_quant.py
index a3276cc5319..f793e656a15 100644
--- a/tests/diffusion/quantization/test_kv_quant.py
+++ b/tests/diffusion/quantization/test_kv_quant.py
@@ -141,3 +141,93 @@ def test_attention_metadata_kv_cache_dtype():
 
     meta.kv_cache_dtype = "fp8"
     assert meta.kv_cache_dtype == "fp8"
+
+
+def test_fast_qkv_quantization():
+    """quantize_qkv_fp8_fast should use scale=1.0 (direct cast)."""
+    from vllm_omni.quantization.kv_quant import quantize_qkv_fp8_fast
+
+    q = torch.randn(1, 32, 4, 64, dtype=torch.bfloat16)
+    k = torch.randn(1, 32, 4, 64, dtype=torch.bfloat16)
+    v = torch.randn(1, 32, 4, 64, dtype=torch.bfloat16)
+
+    fp8_q, fp8_k, fp8_v, q_s, k_s, v_s = quantize_qkv_fp8_fast(q, k, v)
+
+    assert fp8_q.dtype == torch.float8_e4m3fn
+    assert fp8_k.dtype == torch.float8_e4m3fn
+    assert fp8_v.dtype == torch.float8_e4m3fn
+    # Fast path uses scale=1.0
+    assert q_s.item() == 1.0
+    assert k_s.item() == 1.0
+    assert v_s.item() == 1.0
+
+
+def test_fast_kv_quantization():
+    """quantize_kv_fp8_fast for joint attention path."""
+    from vllm_omni.quantization.kv_quant import quantize_kv_fp8_fast
+
+    k = torch.randn(1, 64, 4, 32, dtype=torch.bfloat16)
+    v = torch.randn(1, 64, 4, 32, dtype=torch.bfloat16)
+
+    fp8_k, fp8_v, k_s, v_s = quantize_kv_fp8_fast(k, v)
+
+    assert fp8_k.dtype == torch.float8_e4m3fn
+    assert fp8_v.dtype == torch.float8_e4m3fn
+    assert k_s.item() == 1.0
+    assert v_s.item() == 1.0
+
+
+def test_flash_backend_supports_kv_cache_dtype():
+    """FlashAttentionBackend should declare FP8 support."""
+    from vllm_omni.diffusion.attention.backends.flash_attn import FlashAttentionBackend
+
+    assert FlashAttentionBackend.supports_kv_cache_dtype(None) is True
+    assert FlashAttentionBackend.supports_kv_cache_dtype("fp8") is True
+    assert FlashAttentionBackend.supports_kv_cache_dtype("fp8_e4m3") is True
+    assert FlashAttentionBackend.supports_kv_cache_dtype("mxfp8") is False
+
+
+def test_sdpa_backend_does_not_support_fp8():
+    """SDPABackend should not declare FP8 support."""
+    from vllm_omni.diffusion.attention.backends.sdpa import SDPABackend
+
+    assert SDPABackend.supports_kv_cache_dtype(None) is True
+    assert SDPABackend.supports_kv_cache_dtype("fp8") is False
+
+
+def test_handle_kv_cache_dtype_clears_unsupported():
+    """_handle_kv_cache_dtype should clear unsupported dtype to None."""
+    from vllm_omni.diffusion.attention.backends.abstract import AttentionMetadata
+    from vllm_omni.diffusion.attention.backends.sdpa import SDPAImpl
+
+    impl = SDPAImpl(num_heads=4, head_size=64, softmax_scale=0.125)
+    meta = AttentionMetadata(kv_cache_dtype="fp8")
+
+    # SDPA has empty _supported_kv_cache_dtypes, should clear fp8
+    impl._handle_kv_cache_dtype(meta, "cuda")
+    assert meta.kv_cache_dtype is None
+
+
+def test_handle_kv_cache_dtype_preserves_supported():
+    """_handle_kv_cache_dtype should preserve supported dtype."""
+    from vllm_omni.diffusion.attention.backends.abstract import AttentionMetadata
+    from vllm_omni.diffusion.attention.backends.flash_attn import FlashAttentionImpl
+
+    impl = FlashAttentionImpl(num_heads=4, head_size=64, softmax_scale=0.125)
+    meta = AttentionMetadata(kv_cache_dtype="fp8")
+
+    impl._handle_kv_cache_dtype(meta, "cuda")
+    assert meta.kv_cache_dtype == "fp8"
+
+
+def test_handle_kv_cache_dtype_clears_unsupported_platform():
+    """FP8 on FlashAttention should be cleared for non-CUDA platforms."""
+    from vllm_omni.diffusion.attention.backends.abstract import AttentionMetadata
+    from vllm_omni.diffusion.attention.backends.flash_attn import FlashAttentionImpl
+
+    impl = FlashAttentionImpl(num_heads=4, head_size=64, softmax_scale=0.125)
+    meta = AttentionMetadata(kv_cache_dtype="fp8")
+
+    # NPU not in FlashAttentionImpl._supported_kv_cache_dtypes
+    impl._handle_kv_cache_dtype(meta, "npu")
+    assert meta.kv_cache_dtype is None

From 54e98d906a18a21bc5fac567fd8d1e77f6c15d50 Mon Sep 17 00:00:00 2001
From: SYLAR <125541396+lishunyang12@users.noreply.github.com>
Date: Thu, 9 Apr 2026 00:15:58 +0800
Subject: [PATCH 36/45] Update
 vllm_omni/diffusion/attention/backends/abstract.py

Co-authored-by: Canlin Guo <961750412@qq.com>
Signed-off-by: SYLAR <125541396+lishunyang12@users.noreply.github.com>
---
 vllm_omni/diffusion/attention/backends/abstract.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm_omni/diffusion/attention/backends/abstract.py b/vllm_omni/diffusion/attention/backends/abstract.py
index e8cb8abd65e..b4a269d1660 100644
--- a/vllm_omni/diffusion/attention/backends/abstract.py
+++ b/vllm_omni/diffusion/attention/backends/abstract.py
@@ -153,7 +153,7 @@ def forward(
         attn_metadata: T | None = None,
     ) -> torch.Tensor:
         """Dispatch to platform-specific forward implementation."""
-        platform_key = current_omni_platform._omni_enum.value
+        platform_key = current_omni_platform.device_name
         method_name = self._PLATFORM_DISPATCH.get(platform_key)
         if method_name is None:
             raise NotImplementedError(

From b45e17949d916fb4de0fb6d6ad41c6a4d8773c9c Mon Sep 17 00:00:00 2001
From: lishunyang <lishunyang12@163.com>
Date: Thu, 9 Apr 2026 21:34:30 +0800
Subject: [PATCH 37/45] Add SageAttention vs FlashAttention benchmark script

Signed-off-by: lishunyang <lishunyang12@163.com>
---
 benchmarks/diffusion/bench_sage_comparison.sh | 35 +++++++++++++++++++
 1 file changed, 35 insertions(+)
 create mode 100644 benchmarks/diffusion/bench_sage_comparison.sh

diff --git a/benchmarks/diffusion/bench_sage_comparison.sh b/benchmarks/diffusion/bench_sage_comparison.sh
new file mode 100644
index 00000000000..6e9d0caf009
--- /dev/null
+++ b/benchmarks/diffusion/bench_sage_comparison.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+# Benchmark: HunyuanVideo 1.5 480p — BF16 baseline vs SageAttention
+# Resolution: 480×832, 33 frames
+set -e
+
+MODEL="hunyuanvideo-community/HunyuanVideo-1.5-480p_t2v"
+PROMPT="A serene lakeside sunrise with mist over the water."
+SCRIPT="examples/offline_inference/text_to_video/text_to_video.py"
+OUTPUT_DIR="${OUTPUT_DIR:-/workspace}"
+
+COMMON_ARGS="--model $MODEL \
+  --height 480 --width 832 --num-frames 33 \
+  --num-inference-steps 50 \
+  --guidance-scale 6.0 \
+  --seed 42 \
+  --vae-use-tiling \
+  --enforce-eager"
+
+echo "============================================"
+echo "=== 1/2: BF16 + FlashAttention (baseline)==="
+echo "============================================"
+DIFFUSION_ATTENTION_BACKEND=FLASH_ATTN \
+  python $SCRIPT $COMMON_ARGS \
+  --output "$OUTPUT_DIR/output_flash_attn.mp4"
+
+echo ""
+echo "============================================"
+echo "=== 2/2: BF16 + SageAttention            ==="
+echo "============================================"
+DIFFUSION_ATTENTION_BACKEND=SAGE_ATTN \
+  python $SCRIPT $COMMON_ARGS \
+  --output "$OUTPUT_DIR/output_sage_attn.mp4"
+
+echo ""
+echo "=== Done. Compare: output_flash_attn.mp4 vs output_sage_attn.mp4 ==="

From 087dc093187f9e89c4f9e7c294a280331b89c11b Mon Sep 17 00:00:00 2001
From: lishunyang <lishunyang12@163.com>
Date: Thu, 9 Apr 2026 21:47:45 +0800
Subject: [PATCH 38/45] Fix model name to HunyuanVideo-1.5-Diffusers-480p_t2v

Signed-off-by: lishunyang <lishunyang12@163.com>
---
 benchmarks/diffusion/bench_sage_comparison.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/diffusion/bench_sage_comparison.sh b/benchmarks/diffusion/bench_sage_comparison.sh
index 6e9d0caf009..0c5b20fc4c9 100644
--- a/benchmarks/diffusion/bench_sage_comparison.sh
+++ b/benchmarks/diffusion/bench_sage_comparison.sh
@@ -3,7 +3,7 @@
 # Resolution: 480×832, 33 frames
 set -e
 
-MODEL="hunyuanvideo-community/HunyuanVideo-1.5-480p_t2v"
+MODEL="hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-480p_t2v"
 PROMPT="A serene lakeside sunrise with mist over the water."
 SCRIPT="examples/offline_inference/text_to_video/text_to_video.py"
 OUTPUT_DIR="${OUTPUT_DIR:-/workspace}"

From 4da4f9458a66954edbd7baeaf3f0cb810ea7e793 Mon Sep 17 00:00:00 2001
From: lishunyang <lishunyang12@163.com>
Date: Thu, 9 Apr 2026 22:21:21 +0800
Subject: [PATCH 39/45] Add attention kernel benchmark (FA vs Sage vs SDPA)

Signed-off-by: lishunyang <lishunyang12@163.com>
---
 benchmarks/diffusion/bench_attn_kernel.py | 138 ++++++++++++++++++++++
 1 file changed, 138 insertions(+)
 create mode 100644 benchmarks/diffusion/bench_attn_kernel.py

diff --git a/benchmarks/diffusion/bench_attn_kernel.py b/benchmarks/diffusion/bench_attn_kernel.py
new file mode 100644
index 00000000000..b175322a1cf
--- /dev/null
+++ b/benchmarks/diffusion/bench_attn_kernel.py
@@ -0,0 +1,138 @@
+# SPDX-License-Identifier: Apache-2.0
+"""
+Kernel-level benchmark: FlashAttention vs SageAttention
+Isolates attention kernel performance from model loading, torch.compile, VAE, etc.
+
+HunyuanVideo 1.5 config: 16 heads, head_dim=128
+Latent seq lengths (after VAE compression 4x temporal, 16x spatial):
+  480x832, 33 frames  -> ~9 x 30 x 52  = ~14,040
+  480x832, 121 frames -> ~31 x 30 x 52 = ~48,360
+
+Usage:
+  python bench_attn_kernel.py
+  python bench_attn_kernel.py --seq-len 48360 --num-heads 16 --head-dim 128
+"""
+
+import argparse
+import time
+
+import torch
+
+
+def benchmark_fn(fn, warmup=5, repeat=20, **kwargs):
+    """Benchmark a function with CUDA synchronization."""
+    for _ in range(warmup):
+        fn(**kwargs)
+    torch.cuda.synchronize()
+
+    times = []
+    for _ in range(repeat):
+        torch.cuda.synchronize()
+        t0 = time.perf_counter()
+        fn(**kwargs)
+        torch.cuda.synchronize()
+        t1 = time.perf_counter()
+        times.append((t1 - t0) * 1000)  # ms
+
+    times.sort()
+    # trim top/bottom 20%
+    trim = max(1, len(times) // 5)
+    trimmed = times[trim:-trim] if trim < len(times) // 2 else times
+    avg = sum(trimmed) / len(trimmed)
+    return avg, min(times), max(times)
+
+
+def bench_flash_attn(q, k, v):
+    from flash_attn import flash_attn_func
+    return flash_attn_func(q, k, v, causal=False)
+
+
+def bench_sage_attn(q, k, v):
+    from sageattention import sageattn
+    return sageattn(q, k, v, tensor_layout="NHD", is_causal=False)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--seq-len", type=int, default=48360,
+                        help="Sequence length (default: 48360 for 121 frames)")
+    parser.add_argument("--num-heads", type=int, default=16)
+    parser.add_argument("--head-dim", type=int, default=128)
+    parser.add_argument("--batch-size", type=int, default=1)
+    parser.add_argument("--dtype", type=str, default="bfloat16",
+                        choices=["bfloat16", "float16"])
+    parser.add_argument("--warmup", type=int, default=5)
+    parser.add_argument("--repeat", type=int, default=20)
+    args = parser.parse_args()
+
+    dtype = getattr(torch, args.dtype)
+    device = "cuda"
+    B, S, H, D = args.batch_size, args.seq_len, args.num_heads, args.head_dim
+
+    print(f"Config: B={B}, S={S}, H={H}, D={D}, dtype={args.dtype}")
+    print(f"Tensor shape: ({B}, {S}, {H}, {D})")
+    mem_per_tensor = B * S * H * D * (2 if dtype == torch.float16 or dtype == torch.bfloat16 else 4)
+    print(f"Memory per Q/K/V tensor: {mem_per_tensor / 1e6:.1f} MB")
+    print(f"Warmup={args.warmup}, Repeat={args.repeat}")
+    print()
+
+    q = torch.randn(B, S, H, D, dtype=dtype, device=device)
+    k = torch.randn(B, S, H, D, dtype=dtype, device=device)
+    v = torch.randn(B, S, H, D, dtype=dtype, device=device)
+
+    results = {}
+
+    # --- FlashAttention ---
+    try:
+        from flash_attn import flash_attn_func  # noqa: F401
+        avg, lo, hi = benchmark_fn(bench_flash_attn, warmup=args.warmup,
+                                   repeat=args.repeat, q=q, k=k, v=v)
+        results["FlashAttention"] = (avg, lo, hi)
+        print(f"FlashAttention:  avg={avg:7.2f} ms  min={lo:7.2f} ms  max={hi:7.2f} ms")
+    except ImportError:
+        print("FlashAttention:  NOT AVAILABLE (flash_attn not installed)")
+    except Exception as e:
+        print(f"FlashAttention:  ERROR - {e}")
+
+    # --- SageAttention ---
+    try:
+        from sageattention import sageattn  # noqa: F401
+        avg, lo, hi = benchmark_fn(bench_sage_attn, warmup=args.warmup,
+                                   repeat=args.repeat, q=q, k=k, v=v)
+        results["SageAttention"] = (avg, lo, hi)
+        print(f"SageAttention:   avg={avg:7.2f} ms  min={lo:7.2f} ms  max={hi:7.2f} ms")
+    except ImportError:
+        print("SageAttention:   NOT AVAILABLE (sageattention not installed)")
+    except Exception as e:
+        print(f"SageAttention:   ERROR - {e}")
+
+    # --- torch SDPA ---
+    try:
+        # SDPA expects (B, H, S, D)
+        q_sdpa = q.transpose(1, 2)
+        k_sdpa = k.transpose(1, 2)
+        v_sdpa = v.transpose(1, 2)
+
+        def bench_sdpa(q, k, v):
+            return torch.nn.functional.scaled_dot_product_attention(
+                q, k, v, is_causal=False)
+
+        avg, lo, hi = benchmark_fn(bench_sdpa, warmup=args.warmup,
+                                   repeat=args.repeat, q=q_sdpa, k=k_sdpa, v=v_sdpa)
+        results["torch SDPA"] = (avg, lo, hi)
+        print(f"torch SDPA:      avg={avg:7.2f} ms  min={lo:7.2f} ms  max={hi:7.2f} ms")
+    except Exception as e:
+        print(f"torch SDPA:      ERROR - {e}")
+
+    # --- Summary ---
+    if len(results) >= 2:
+        print()
+        baseline_name = "FlashAttention" if "FlashAttention" in results else list(results.keys())[0]
+        baseline_avg = results[baseline_name][0]
+        for name, (avg, lo, hi) in results.items():
+            ratio = avg / baseline_avg
+            print(f"  {name:20s}  {avg:7.2f} ms  ({ratio:.2f}x vs {baseline_name})")
+
+
+if __name__ == "__main__":
+    main()

From dd18ba37aa655317e2cebf3a5a464c5dff307e14 Mon Sep 17 00:00:00 2001
From: lishunyang <lishunyang12@163.com>
Date: Thu, 9 Apr 2026 22:24:11 +0800
Subject: [PATCH 40/45] Support fa3_fwd_interface in kernel benchmark

Signed-off-by: lishunyang <lishunyang12@163.com>
---
 benchmarks/diffusion/bench_attn_kernel.py | 35 +++++++++++++++++------
 1 file changed, 26 insertions(+), 9 deletions(-)

diff --git a/benchmarks/diffusion/bench_attn_kernel.py b/benchmarks/diffusion/bench_attn_kernel.py
index b175322a1cf..6ff3d6fb421 100644
--- a/benchmarks/diffusion/bench_attn_kernel.py
+++ b/benchmarks/diffusion/bench_attn_kernel.py
@@ -42,9 +42,23 @@ def benchmark_fn(fn, warmup=5, repeat=20, **kwargs):
     return avg, min(times), max(times)
 
 
-def bench_flash_attn(q, k, v):
-    from flash_attn import flash_attn_func
-    return flash_attn_func(q, k, v, causal=False)
+def _get_flash_attn_func():
+    """Try fa3_fwd_interface -> flash_attn_interface -> flash_attn (same order as vllm-omni)."""
+    for module_name in [
+        "fa3_fwd_interface",
+        "flash_attn_interface",
+        "flash_attn",
+    ]:
+        try:
+            mod = __import__(module_name, fromlist=["flash_attn_func"])
+            return getattr(mod, "flash_attn_func"), module_name
+        except (ImportError, AttributeError):
+            continue
+    return None, None
+
+
+def bench_flash_attn(q, k, v, _fn=None):
+    return _fn(q, k, v, causal=False)
 
 
 def bench_sage_attn(q, k, v):
@@ -82,15 +96,18 @@ def main():
 
     results = {}
 
-    # --- FlashAttention ---
+    # --- FlashAttention (fa3_fwd / flash_attn_interface / flash_attn) ---
     try:
-        from flash_attn import flash_attn_func  # noqa: F401
+        fa_func, fa_module = _get_flash_attn_func()
+        if fa_func is None:
+            raise ImportError("none of fa3_fwd_interface, flash_attn_interface, flash_attn found")
+        label = f"FlashAttn ({fa_module})"
         avg, lo, hi = benchmark_fn(bench_flash_attn, warmup=args.warmup,
-                                   repeat=args.repeat, q=q, k=k, v=v)
+                                   repeat=args.repeat, q=q, k=k, v=v, _fn=fa_func)
         results["FlashAttention"] = (avg, lo, hi)
-        print(f"FlashAttention:  avg={avg:7.2f} ms  min={lo:7.2f} ms  max={hi:7.2f} ms")
-    except ImportError:
-        print("FlashAttention:  NOT AVAILABLE (flash_attn not installed)")
+        print(f"{label:24s} avg={avg:7.2f} ms  min={lo:7.2f} ms  max={hi:7.2f} ms")
+    except ImportError as e:
+        print(f"FlashAttention:  NOT AVAILABLE ({e})")
     except Exception as e:
         print(f"FlashAttention:  ERROR - {e}")
 

From 19c7df6ee11690c64ac9b6f945ae6a52af8f2b75 Mon Sep 17 00:00:00 2001
From: lishunyang <lishunyang12@163.com>
Date: Thu, 9 Apr 2026 22:33:16 +0800
Subject: [PATCH 41/45] Rewrite kernel bench to match SageAttention official
 style

TFLOPS metric, CUDA events timing, L2 flush, sweep mode.
Ref: https://github.com/thu-ml/SageAttention/tree/main/bench

Signed-off-by: lishunyang <lishunyang12@163.com>
---
 benchmarks/diffusion/bench_attn_kernel.py | 266 ++++++++++++++--------
 1 file changed, 167 insertions(+), 99 deletions(-)

diff --git a/benchmarks/diffusion/bench_attn_kernel.py b/benchmarks/diffusion/bench_attn_kernel.py
index 6ff3d6fb421..a3f0cb8e1d7 100644
--- a/benchmarks/diffusion/bench_attn_kernel.py
+++ b/benchmarks/diffusion/bench_attn_kernel.py
@@ -1,16 +1,24 @@
 # SPDX-License-Identifier: Apache-2.0
 """
-Kernel-level benchmark: FlashAttention vs SageAttention
-Isolates attention kernel performance from model loading, torch.compile, VAE, etc.
+Kernel-level benchmark: FA3 vs SageAttention vs SDPA
+Follows SageAttention official bench style (TFLOPS + sweep seq lengths).
+Reference: https://github.com/thu-ml/SageAttention/tree/main/bench
 
-HunyuanVideo 1.5 config: 16 heads, head_dim=128
-Latent seq lengths (after VAE compression 4x temporal, 16x spatial):
-  480x832, 33 frames  -> ~9 x 30 x 52  = ~14,040
-  480x832, 121 frames -> ~31 x 30 x 52 = ~48,360
+HunyuanVideo 1.5 diffusion config: B=1, H=16, D=128
+LLM-style config (SageAttention default): B=4, H=32, D=128
 
 Usage:
+  # Diffusion config (default) — HunyuanVideo 1.5
   python bench_attn_kernel.py
-  python bench_attn_kernel.py --seq-len 48360 --num-heads 16 --head-dim 128
+
+  # LLM config (matches SageAttention official bench)
+  python bench_attn_kernel.py --batch-size 4 --num-heads 32 --dtype float16
+
+  # Single seq length
+  python bench_attn_kernel.py --seq-len 48360
+
+  # Sweep mode (multiple seq lengths)
+  python bench_attn_kernel.py --sweep
 """
 
 import argparse
@@ -19,31 +27,44 @@
 import torch
 
 
-def benchmark_fn(fn, warmup=5, repeat=20, **kwargs):
-    """Benchmark a function with CUDA synchronization."""
+def _flush_l2():
+    """Flush L2 cache with 256 MB zeros (same as SageAttention bench)."""
+    cache = torch.empty(int(256e6 // 4), dtype=torch.int, device="cuda")
+    cache.zero_()
+
+
+def benchmark_fn(fn, warmup=5, repeat=100, flush_l2=True):
+    """Benchmark with CUDA events (matches SageAttention bench style)."""
+    # warmup
     for _ in range(warmup):
-        fn(**kwargs)
+        fn()
+
     torch.cuda.synchronize()
+    start = torch.cuda.Event(enable_timing=True)
+    end = torch.cuda.Event(enable_timing=True)
 
-    times = []
+    start.record()
     for _ in range(repeat):
-        torch.cuda.synchronize()
-        t0 = time.perf_counter()
-        fn(**kwargs)
-        torch.cuda.synchronize()
-        t1 = time.perf_counter()
-        times.append((t1 - t0) * 1000)  # ms
+        if flush_l2:
+            _flush_l2()
+        fn()
+    end.record()
+    torch.cuda.synchronize()
+
+    elapsed_ms = start.elapsed_time(end) / repeat
+    return elapsed_ms
+
 
-    times.sort()
-    # trim top/bottom 20%
-    trim = max(1, len(times) // 5)
-    trimmed = times[trim:-trim] if trim < len(times) // 2 else times
-    avg = sum(trimmed) / len(trimmed)
-    return avg, min(times), max(times)
+def calc_flops(batch, heads, headdim, seq_len, causal=False):
+    """Standard attention FLOPS: 4 * B * H * D * S^2 (halved if causal)."""
+    flops = 4 * batch * heads * headdim * seq_len * seq_len
+    if causal:
+        flops //= 2
+    return flops
 
 
 def _get_flash_attn_func():
-    """Try fa3_fwd_interface -> flash_attn_interface -> flash_attn (same order as vllm-omni)."""
+    """Try fa3_fwd_interface -> flash_attn_interface -> flash_attn."""
     for module_name in [
         "fa3_fwd_interface",
         "flash_attn_interface",
@@ -57,38 +78,10 @@ def _get_flash_attn_func():
     return None, None
 
 
-def bench_flash_attn(q, k, v, _fn=None):
-    return _fn(q, k, v, causal=False)
-
-
-def bench_sage_attn(q, k, v):
-    from sageattention import sageattn
-    return sageattn(q, k, v, tensor_layout="NHD", is_causal=False)
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--seq-len", type=int, default=48360,
-                        help="Sequence length (default: 48360 for 121 frames)")
-    parser.add_argument("--num-heads", type=int, default=16)
-    parser.add_argument("--head-dim", type=int, default=128)
-    parser.add_argument("--batch-size", type=int, default=1)
-    parser.add_argument("--dtype", type=str, default="bfloat16",
-                        choices=["bfloat16", "float16"])
-    parser.add_argument("--warmup", type=int, default=5)
-    parser.add_argument("--repeat", type=int, default=20)
-    args = parser.parse_args()
-
-    dtype = getattr(torch, args.dtype)
+def run_single(B, S, H, D, dtype, repeat, causal=False):
+    """Run all backends for a single (B, S, H, D) config."""
     device = "cuda"
-    B, S, H, D = args.batch_size, args.seq_len, args.num_heads, args.head_dim
-
-    print(f"Config: B={B}, S={S}, H={H}, D={D}, dtype={args.dtype}")
-    print(f"Tensor shape: ({B}, {S}, {H}, {D})")
-    mem_per_tensor = B * S * H * D * (2 if dtype == torch.float16 or dtype == torch.bfloat16 else 4)
-    print(f"Memory per Q/K/V tensor: {mem_per_tensor / 1e6:.1f} MB")
-    print(f"Warmup={args.warmup}, Repeat={args.repeat}")
-    print()
+    flops = calc_flops(B, H, D, S, causal)
 
     q = torch.randn(B, S, H, D, dtype=dtype, device=device)
     k = torch.randn(B, S, H, D, dtype=dtype, device=device)
@@ -96,59 +89,134 @@ def main():
 
     results = {}
 
-    # --- FlashAttention (fa3_fwd / flash_attn_interface / flash_attn) ---
-    try:
-        fa_func, fa_module = _get_flash_attn_func()
-        if fa_func is None:
-            raise ImportError("none of fa3_fwd_interface, flash_attn_interface, flash_attn found")
-        label = f"FlashAttn ({fa_module})"
-        avg, lo, hi = benchmark_fn(bench_flash_attn, warmup=args.warmup,
-                                   repeat=args.repeat, q=q, k=k, v=v, _fn=fa_func)
-        results["FlashAttention"] = (avg, lo, hi)
-        print(f"{label:24s} avg={avg:7.2f} ms  min={lo:7.2f} ms  max={hi:7.2f} ms")
-    except ImportError as e:
-        print(f"FlashAttention:  NOT AVAILABLE ({e})")
-    except Exception as e:
-        print(f"FlashAttention:  ERROR - {e}")
+    # --- FA3 / FlashAttention ---
+    fa_func, fa_module = _get_flash_attn_func()
+    if fa_func is not None:
+        try:
+            ms = benchmark_fn(lambda: fa_func(q, k, v, causal=causal), repeat=repeat)
+            tflops = flops / ms / 1e9  # ms -> s -> TFLOPS
+            results["FA3"] = (ms, tflops)
+        except Exception as e:
+            results["FA3"] = (None, f"ERROR: {e}")
+    else:
+        results["FA3"] = (None, "N/A")
 
     # --- SageAttention ---
     try:
-        from sageattention import sageattn  # noqa: F401
-        avg, lo, hi = benchmark_fn(bench_sage_attn, warmup=args.warmup,
-                                   repeat=args.repeat, q=q, k=k, v=v)
-        results["SageAttention"] = (avg, lo, hi)
-        print(f"SageAttention:   avg={avg:7.2f} ms  min={lo:7.2f} ms  max={hi:7.2f} ms")
+        from sageattention import sageattn
+        ms = benchmark_fn(
+            lambda: sageattn(q, k, v, tensor_layout="NHD", is_causal=causal),
+            repeat=repeat,
+        )
+        tflops = flops / ms / 1e9
+        results["SageAttn"] = (ms, tflops)
     except ImportError:
-        print("SageAttention:   NOT AVAILABLE (sageattention not installed)")
+        results["SageAttn"] = (None, "N/A")
     except Exception as e:
-        print(f"SageAttention:   ERROR - {e}")
+        results["SageAttn"] = (None, f"ERROR: {e}")
 
     # --- torch SDPA ---
     try:
-        # SDPA expects (B, H, S, D)
-        q_sdpa = q.transpose(1, 2)
-        k_sdpa = k.transpose(1, 2)
-        v_sdpa = v.transpose(1, 2)
-
-        def bench_sdpa(q, k, v):
-            return torch.nn.functional.scaled_dot_product_attention(
-                q, k, v, is_causal=False)
-
-        avg, lo, hi = benchmark_fn(bench_sdpa, warmup=args.warmup,
-                                   repeat=args.repeat, q=q_sdpa, k=k_sdpa, v=v_sdpa)
-        results["torch SDPA"] = (avg, lo, hi)
-        print(f"torch SDPA:      avg={avg:7.2f} ms  min={lo:7.2f} ms  max={hi:7.2f} ms")
+        q_sdpa = q.transpose(1, 2).contiguous()
+        k_sdpa = k.transpose(1, 2).contiguous()
+        v_sdpa = v.transpose(1, 2).contiguous()
+        ms = benchmark_fn(
+            lambda: torch.nn.functional.scaled_dot_product_attention(
+                q_sdpa, k_sdpa, v_sdpa, is_causal=causal
+            ),
+            repeat=repeat,
+        )
+        tflops = flops / ms / 1e9
+        results["SDPA"] = (ms, tflops)
     except Exception as e:
-        print(f"torch SDPA:      ERROR - {e}")
-
-    # --- Summary ---
-    if len(results) >= 2:
-        print()
-        baseline_name = "FlashAttention" if "FlashAttention" in results else list(results.keys())[0]
-        baseline_avg = results[baseline_name][0]
-        for name, (avg, lo, hi) in results.items():
-            ratio = avg / baseline_avg
-            print(f"  {name:20s}  {avg:7.2f} ms  ({ratio:.2f}x vs {baseline_name})")
+        results["SDPA"] = (None, f"ERROR: {e}")
+
+    return results
+
+
+def print_row(seq_len, results):
+    """Print one row of results."""
+    parts = [f"S={seq_len:>6d}"]
+    for name in ["FA3", "SageAttn", "SDPA"]:
+        ms, tflops = results.get(name, (None, "N/A"))
+        if ms is not None:
+            parts.append(f"{name}: {ms:7.2f} ms ({tflops:6.1f} TFLOPS)")
+        else:
+            parts.append(f"{name}: {tflops}")
+    print("  ".join(parts))
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Attention kernel benchmark (FA3 vs SageAttn vs SDPA)")
+    parser.add_argument("--batch-size", type=int, default=1)
+    parser.add_argument("--num-heads", type=int, default=16)
+    parser.add_argument("--head-dim", type=int, default=128)
+    parser.add_argument("--dtype", type=str, default="bfloat16",
+                        choices=["bfloat16", "float16"])
+    parser.add_argument("--repeat", type=int, default=100)
+    parser.add_argument("--causal", action="store_true")
+    parser.add_argument("--seq-len", type=int, default=None,
+                        help="Single seq length to test")
+    parser.add_argument("--sweep", action="store_true",
+                        help="Sweep standard seq lengths (1K-32K) + diffusion lengths")
+    args = parser.parse_args()
+
+    dtype = getattr(torch, args.dtype)
+    B, H, D = args.batch_size, args.num_heads, args.head_dim
+
+    fa_func, fa_module = _get_flash_attn_func()
+    fa_label = f"fa3_fwd ({fa_module})" if fa_module else "N/A"
+
+    print(f"Config: B={B}, H={H}, D={D}, dtype={args.dtype}, causal={args.causal}")
+    print(f"FlashAttn source: {fa_label}")
+    print(f"Repeat: {args.repeat}, L2 flush: enabled")
+    print(f"GPU: {torch.cuda.get_device_name(0)}")
+    print()
+
+    if args.seq_len is not None:
+        # Single seq length mode
+        seq_lens = [args.seq_len]
+    elif args.sweep:
+        # Sweep: standard (SageAttention bench) + diffusion-specific
+        seq_lens = [1024, 2048, 4096, 8192, 14040, 16384, 32768, 48360]
+    else:
+        # Default: diffusion-relevant lengths
+        seq_lens = [14040, 48360]
+
+    print(f"{'S':>8s}  {'FA3':>24s}  {'SageAttn':>24s}  {'SDPA':>24s}")
+    print("-" * 90)
+
+    all_results = {}
+    for S in seq_lens:
+        results = run_single(B, S, H, D, dtype, args.repeat, causal=args.causal)
+        all_results[S] = results
+
+        parts = [f"{S:>8d}"]
+        for name in ["FA3", "SageAttn", "SDPA"]:
+            ms, tflops = results.get(name, (None, "N/A"))
+            if ms is not None:
+                parts.append(f"{ms:7.2f} ms / {tflops:6.1f} TF")
+            else:
+                parts.append(f"{'N/A':>24s}")
+        print("  ".join(parts))
+
+    # Summary
+    print()
+    print("Speedup vs FA3:")
+    for S in seq_lens:
+        results = all_results[S]
+        fa3_ms = results["FA3"][0] if results["FA3"][0] else None
+        if fa3_ms is None:
+            continue
+        parts = [f"  S={S:>6d}"]
+        for name in ["SageAttn", "SDPA"]:
+            ms = results[name][0] if results[name][0] else None
+            if ms:
+                parts.append(f"{name}: {ms/fa3_ms:.2f}x")
+            else:
+                parts.append(f"{name}: N/A")
+        print("  ".join(parts))
 
 
 if __name__ == "__main__":

From c20feec068d94fa85375d951064f4635692b6fe4 Mon Sep 17 00:00:00 2001
From: lishunyang <lishunyang12@163.com>
Date: Thu, 9 Apr 2026 22:42:01 +0800
Subject: [PATCH 42/45] Rewrite bench to exactly follow SageAttention official
 bench

Signed-off-by: lishunyang <lishunyang12@163.com>
---
 benchmarks/diffusion/bench_attn_kernel.py | 501 +++++++++++++---------
 1 file changed, 295 insertions(+), 206 deletions(-)

diff --git a/benchmarks/diffusion/bench_attn_kernel.py b/benchmarks/diffusion/bench_attn_kernel.py
index a3f0cb8e1d7..936d403da99 100644
--- a/benchmarks/diffusion/bench_attn_kernel.py
+++ b/benchmarks/diffusion/bench_attn_kernel.py
@@ -1,223 +1,312 @@
-# SPDX-License-Identifier: Apache-2.0
 """
-Kernel-level benchmark: FA3 vs SageAttention vs SDPA
-Follows SageAttention official bench style (TFLOPS + sweep seq lengths).
-Reference: https://github.com/thu-ml/SageAttention/tree/main/bench
+Reimplemented from SageAttention official bench:
+https://github.com/thu-ml/SageAttention/tree/main/bench
 
-HunyuanVideo 1.5 diffusion config: B=1, H=16, D=128
-LLM-style config (SageAttention default): B=4, H=32, D=128
+Scripts:
+  bench_baseline.py       -> --method fa2/torch/xformers
+  bench_fa3.py            -> --method fa3
+  bench_qk_int8_pv_fp16_cuda.py   -> --method sage_int8_fp16_cuda
+  bench_qk_int8_pv_fp16_triton.py -> --method sage_int8_fp16_triton
+  bench_qk_int8_pv_fp8_cuda.py    -> --method sage_int8_fp8_cuda (SM89, RTX 4090)
+  bench_qk_int8_pv_fp8_cuda_sm90.py -> --method sage_int8_fp8_cuda_sm90 (H100)
 
 Usage:
-  # Diffusion config (default) — HunyuanVideo 1.5
-  python bench_attn_kernel.py
-
-  # LLM config (matches SageAttention official bench)
-  python bench_attn_kernel.py --batch-size 4 --num-heads 32 --dtype float16
-
-  # Single seq length
-  python bench_attn_kernel.py --seq-len 48360
-
-  # Sweep mode (multiple seq lengths)
-  python bench_attn_kernel.py --sweep
+  python bench_attn_kernel.py --method fa3
+  python bench_attn_kernel.py --method fa2
+  python bench_attn_kernel.py --method torch
+  python bench_attn_kernel.py --method sage_int8_fp16_cuda
+  python bench_attn_kernel.py --method sage_int8_fp16_triton
+  python bench_attn_kernel.py --method sage_int8_fp8_cuda
+  python bench_attn_kernel.py --method sage_int8_fp8_cuda_sm90
 """
 
 import argparse
-import time
+import re
+import subprocess
 
 import torch
+from flash_attn.utils.benchmark import benchmark_forward
 
 
-def _flush_l2():
-    """Flush L2 cache with 256 MB zeros (same as SageAttention bench)."""
-    cache = torch.empty(int(256e6 // 4), dtype=torch.int, device="cuda")
-    cache.zero_()
-
-
-def benchmark_fn(fn, warmup=5, repeat=100, flush_l2=True):
-    """Benchmark with CUDA events (matches SageAttention bench style)."""
-    # warmup
-    for _ in range(warmup):
-        fn()
-
-    torch.cuda.synchronize()
-    start = torch.cuda.Event(enable_timing=True)
-    end = torch.cuda.Event(enable_timing=True)
-
-    start.record()
-    for _ in range(repeat):
-        if flush_l2:
-            _flush_l2()
-        fn()
-    end.record()
-    torch.cuda.synchronize()
-
-    elapsed_ms = start.elapsed_time(end) / repeat
-    return elapsed_ms
-
-
-def calc_flops(batch, heads, headdim, seq_len, causal=False):
-    """Standard attention FLOPS: 4 * B * H * D * S^2 (halved if causal)."""
-    flops = 4 * batch * heads * headdim * seq_len * seq_len
-    if causal:
-        flops //= 2
-    return flops
+def get_cuda_version():
+    try:
+        output = subprocess.check_output(['nvcc', '--version']).decode()
+        match = re.search(r'release (\d+)\.(\d+)', output)
+        if match:
+            major, minor = int(match.group(1)), int(match.group(2))
+            return major, minor
+    except Exception as e:
+        print("Failed to get CUDA version:", e)
+    return None, None
 
 
-def _get_flash_attn_func():
-    """Try fa3_fwd_interface -> flash_attn_interface -> flash_attn."""
-    for module_name in [
-        "fa3_fwd_interface",
-        "flash_attn_interface",
-        "flash_attn",
-    ]:
+parser = argparse.ArgumentParser(description='Attention Kernel Benchmark (SageAttention official style)')
+parser.add_argument('--method', type=str, default='fa3',
+                    choices=['fa2', 'torch', 'xformers', 'fa3',
+                             'sage_int8_fp16_cuda', 'sage_int8_fp16_triton',
+                             'sage_int8_fp8_cuda', 'sage_int8_fp8_cuda_sm90'])
+parser.add_argument('--batch_size', type=int, default=4, help='Batch size')
+parser.add_argument('--num_heads', type=int, default=32, help='Number of heads')
+parser.add_argument('--head_dim', type=int, default=128, help='Head dimension')
+parser.add_argument('--quant_gran', type=str, default='per_warp', choices=['per_warp', 'per_thread'],
+                    help='Quantization granularity (sage kernels only)')
+parser.add_argument('--pv_accum_dtype', type=str, default=None,
+                    help='PV accumulation dtype (sage kernels only)')
+args = parser.parse_args()
+
+head = args.num_heads
+batch = args.batch_size
+headdim = args.head_dim
+
+# ============================================================
+# bench_baseline: fa2 / torch / xformers
+# ============================================================
+if args.method in ('fa2', 'torch', 'xformers'):
+    from torch.nn.functional import scaled_dot_product_attention as sdpa
+
+    torch.backends.cuda.enable_flash_sdp(args.method == 'fa2')
+    torch.backends.cuda.enable_math_sdp(args.method == 'torch')
+    torch.backends.cuda.enable_mem_efficient_sdp(args.method == 'xformers')
+
+    print(f"Baseline: {args.method}")
+    print(f"batch: {batch}, head: {head}, headdim: {headdim}")
+
+    for is_causal in [False, True]:
+        print(f"is_causal: {is_causal}")
+        for seq_len in sorted({1024, 2048, 4096, 8192, 16384, 32768}):
+            flops = 4 * head * batch * headdim * seq_len * seq_len // (2 if is_causal else 1)
+            q = torch.randn(batch, head, seq_len, headdim, dtype=torch.float16, device="cuda")
+            k = torch.randn(batch, head, seq_len, headdim, dtype=torch.float16, device="cuda")
+            v = torch.randn(batch, head, seq_len, headdim, dtype=torch.float16, device="cuda")
+            for i in range(5): sdpa(q, k, v, is_causal=is_causal)
+            torch.cuda.synchronize()
+            _, time = benchmark_forward(sdpa, q, k, v, is_causal=is_causal, repeats=100, verbose=False, desc='Triton')
+            print(f'{seq_len} flops:{flops/time.mean*1e-12}')
+
+# ============================================================
+# bench_fa3
+# ============================================================
+elif args.method == 'fa3':
+    # Try fa3_fwd_interface first (vllm-omni custom build), then flash_attn_interface
+    flash_attn_func_v3 = None
+    fa3_source = None
+    for mod_name in ['fa3_fwd_interface', 'flash_attn_interface']:
         try:
-            mod = __import__(module_name, fromlist=["flash_attn_func"])
-            return getattr(mod, "flash_attn_func"), module_name
+            mod = __import__(mod_name, fromlist=['flash_attn_func'])
+            flash_attn_func_v3 = getattr(mod, 'flash_attn_func')
+            fa3_source = mod_name
+            break
         except (ImportError, AttributeError):
             continue
-    return None, None
-
-
-def run_single(B, S, H, D, dtype, repeat, causal=False):
-    """Run all backends for a single (B, S, H, D) config."""
-    device = "cuda"
-    flops = calc_flops(B, H, D, S, causal)
-
-    q = torch.randn(B, S, H, D, dtype=dtype, device=device)
-    k = torch.randn(B, S, H, D, dtype=dtype, device=device)
-    v = torch.randn(B, S, H, D, dtype=dtype, device=device)
-
-    results = {}
-
-    # --- FA3 / FlashAttention ---
-    fa_func, fa_module = _get_flash_attn_func()
-    if fa_func is not None:
-        try:
-            ms = benchmark_fn(lambda: fa_func(q, k, v, causal=causal), repeat=repeat)
-            tflops = flops / ms / 1e9  # ms -> s -> TFLOPS
-            results["FA3"] = (ms, tflops)
-        except Exception as e:
-            results["FA3"] = (None, f"ERROR: {e}")
-    else:
-        results["FA3"] = (None, "N/A")
-
-    # --- SageAttention ---
-    try:
-        from sageattention import sageattn
-        ms = benchmark_fn(
-            lambda: sageattn(q, k, v, tensor_layout="NHD", is_causal=causal),
-            repeat=repeat,
-        )
-        tflops = flops / ms / 1e9
-        results["SageAttn"] = (ms, tflops)
-    except ImportError:
-        results["SageAttn"] = (None, "N/A")
-    except Exception as e:
-        results["SageAttn"] = (None, f"ERROR: {e}")
 
-    # --- torch SDPA ---
-    try:
-        q_sdpa = q.transpose(1, 2).contiguous()
-        k_sdpa = k.transpose(1, 2).contiguous()
-        v_sdpa = v.transpose(1, 2).contiguous()
-        ms = benchmark_fn(
-            lambda: torch.nn.functional.scaled_dot_product_attention(
-                q_sdpa, k_sdpa, v_sdpa, is_causal=causal
-            ),
-            repeat=repeat,
-        )
-        tflops = flops / ms / 1e9
-        results["SDPA"] = (ms, tflops)
-    except Exception as e:
-        results["SDPA"] = (None, f"ERROR: {e}")
-
-    return results
-
-
-def print_row(seq_len, results):
-    """Print one row of results."""
-    parts = [f"S={seq_len:>6d}"]
-    for name in ["FA3", "SageAttn", "SDPA"]:
-        ms, tflops = results.get(name, (None, "N/A"))
-        if ms is not None:
-            parts.append(f"{name}: {ms:7.2f} ms ({tflops:6.1f} TFLOPS)")
-        else:
-            parts.append(f"{name}: {tflops}")
-    print("  ".join(parts))
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        description="Attention kernel benchmark (FA3 vs SageAttn vs SDPA)")
-    parser.add_argument("--batch-size", type=int, default=1)
-    parser.add_argument("--num-heads", type=int, default=16)
-    parser.add_argument("--head-dim", type=int, default=128)
-    parser.add_argument("--dtype", type=str, default="bfloat16",
-                        choices=["bfloat16", "float16"])
-    parser.add_argument("--repeat", type=int, default=100)
-    parser.add_argument("--causal", action="store_true")
-    parser.add_argument("--seq-len", type=int, default=None,
-                        help="Single seq length to test")
-    parser.add_argument("--sweep", action="store_true",
-                        help="Sweep standard seq lengths (1K-32K) + diffusion lengths")
-    args = parser.parse_args()
-
-    dtype = getattr(torch, args.dtype)
-    B, H, D = args.batch_size, args.num_heads, args.head_dim
-
-    fa_func, fa_module = _get_flash_attn_func()
-    fa_label = f"fa3_fwd ({fa_module})" if fa_module else "N/A"
-
-    print(f"Config: B={B}, H={H}, D={D}, dtype={args.dtype}, causal={args.causal}")
-    print(f"FlashAttn source: {fa_label}")
-    print(f"Repeat: {args.repeat}, L2 flush: enabled")
-    print(f"GPU: {torch.cuda.get_device_name(0)}")
-    print()
-
-    if args.seq_len is not None:
-        # Single seq length mode
-        seq_lens = [args.seq_len]
-    elif args.sweep:
-        # Sweep: standard (SageAttention bench) + diffusion-specific
-        seq_lens = [1024, 2048, 4096, 8192, 14040, 16384, 32768, 48360]
-    else:
-        # Default: diffusion-relevant lengths
-        seq_lens = [14040, 48360]
-
-    print(f"{'S':>8s}  {'FA3':>24s}  {'SageAttn':>24s}  {'SDPA':>24s}")
-    print("-" * 90)
-
-    all_results = {}
-    for S in seq_lens:
-        results = run_single(B, S, H, D, dtype, args.repeat, causal=args.causal)
-        all_results[S] = results
-
-        parts = [f"{S:>8d}"]
-        for name in ["FA3", "SageAttn", "SDPA"]:
-            ms, tflops = results.get(name, (None, "N/A"))
-            if ms is not None:
-                parts.append(f"{ms:7.2f} ms / {tflops:6.1f} TF")
-            else:
-                parts.append(f"{'N/A':>24s}")
-        print("  ".join(parts))
-
-    # Summary
-    print()
-    print("Speedup vs FA3:")
-    for S in seq_lens:
-        results = all_results[S]
-        fa3_ms = results["FA3"][0] if results["FA3"][0] else None
-        if fa3_ms is None:
-            continue
-        parts = [f"  S={S:>6d}"]
-        for name in ["SageAttn", "SDPA"]:
-            ms = results[name][0] if results[name][0] else None
-            if ms:
-                parts.append(f"{name}: {ms/fa3_ms:.2f}x")
-            else:
-                parts.append(f"{name}: N/A")
-        print("  ".join(parts))
-
-
-if __name__ == "__main__":
-    main()
+    if flash_attn_func_v3 is None:
+        raise ImportError("Neither fa3_fwd_interface nor flash_attn_interface found. Install FA3.")
+
+    print(f"FlashAttention3 Benchmark (source: {fa3_source})")
+    print(f"batch: {batch}, head: {head}, headdim: {headdim}")
+
+    for is_causal in [False, True]:
+        print(f"is_causal: {is_causal}")
+        for seq_len in sorted({1024, 2048, 4096, 8192, 16384, 32768}):
+            flops = 4 * head * batch * headdim * seq_len * seq_len // (2 if is_causal else 1)
+            q = torch.randn(batch, seq_len, head, headdim, dtype=torch.float16, device="cuda")
+            k = torch.randn(batch, seq_len, head, headdim, dtype=torch.float16, device="cuda")
+            v = torch.randn(batch, seq_len, head, headdim, dtype=torch.float16, device="cuda")
+            for i in range(5): flash_attn_func_v3(q, k, v, causal=is_causal)
+            torch.cuda.synchronize()
+            _, time = benchmark_forward(flash_attn_func_v3, q, k, v, causal=is_causal, repeats=100, verbose=False, desc='Triton')
+            print(f'{seq_len} flops:{flops/time.mean*1e-12}')
+
+# ============================================================
+# bench_qk_int8_pv_fp16_cuda
+# ============================================================
+elif args.method == 'sage_int8_fp16_cuda':
+    import sageattention._qattn_sm80 as qattn
+
+    pv_accum = args.pv_accum_dtype or 'fp16'
+    assert pv_accum in ('fp16', 'fp16+fp32', 'fp32')
+
+    WARP_Q = 16 if (headdim == 128 and pv_accum == "fp16+fp32") else 32
+    WARP_K = 64
+
+    if pv_accum == 'fp32':
+        kernel = qattn.qk_int8_sv_f16_accum_f32_attn
+    elif pv_accum == 'fp16+fp32':
+        kernel = qattn.qk_int8_sv_f16_accum_f16_attn_inst_buf
+    elif pv_accum == 'fp16':
+        kernel = qattn.qk_int8_sv_f16_accum_f16_attn
+
+    _qk_quant_gran = 3 if args.quant_gran == 'per_thread' else 2
+
+    print(f"CUDA QK Int8 PV FP16 Benchmark")
+    print(f"batch: {batch}, head: {head}, headdim: {headdim}, pv_accum_dtype: {pv_accum}")
+
+    for is_causal in [False, True]:
+        _is_causal = 1 if is_causal else 0
+        print(f"is_causal: {is_causal}")
+        for seq_len in sorted({1024, 2048, 4096, 8192, 16384, 32768}):
+            flops = 4 * head * batch * headdim * seq_len * seq_len / (2 if is_causal else 1)
+
+            q = torch.randint(-95, 95, (batch, seq_len, head, headdim), dtype=torch.int8, device="cuda")
+            k = torch.randint(-95, 95, (batch, seq_len, head, headdim), dtype=torch.int8, device="cuda")
+
+            if args.quant_gran == 'per_warp':
+                q_scale = torch.randn(batch, head, seq_len // WARP_Q, dtype=torch.float, device="cuda")
+                k_scale = torch.randn(batch, head, seq_len // WARP_K, dtype=torch.float, device="cuda")
+            elif args.quant_gran == 'per_thread':
+                q_scale = torch.randn(batch, head, seq_len // WARP_Q * 8, dtype=torch.float, device="cuda")
+                k_scale = torch.randn(batch, head, seq_len // WARP_K * 4, dtype=torch.float, device="cuda")
+
+            v = torch.randn(batch, seq_len, head, headdim, dtype=torch.float16, device="cuda")
+            o = torch.empty(batch, seq_len, head, headdim, dtype=torch.float16, device="cuda")
+            sm_scale = 1 / (headdim ** 0.5)
+            for i in range(5): kernel(q, k, v, o, q_scale, k_scale, 0, _is_causal, _qk_quant_gran, sm_scale, 0)
+            torch.cuda.synchronize()
+            _, time = benchmark_forward(kernel, q, k, v, o, q_scale, k_scale, 0, _is_causal, _qk_quant_gran, sm_scale, 0, repeats=100, verbose=False, desc='Triton')
+            print(f'{seq_len} flops:{flops/time.mean*1e-12}')
+
+# ============================================================
+# bench_qk_int8_pv_fp16_triton
+# ============================================================
+elif args.method == 'sage_int8_fp16_triton':
+    from sageattention.triton.attn_qk_int8_per_block import forward
+    from sageattention.triton.attn_qk_int8_per_block_causal import forward as forward_causal
+
+    print(f"Triton QK Int8 PV FP16 Benchmark")
+    print(f"batch_size: {batch}, num_heads: {head}, head_dim: {headdim}")
+
+    # non-causal
+    print("is_causal: False")
+    for seq_len in sorted({1024, 2048, 4096, 8192, 16384, 32768}):
+        flops = 4 * head * batch * headdim * seq_len * seq_len
+
+        q = torch.randint(-100, 100, (batch, head, seq_len, headdim), dtype=torch.int8, device='cuda')
+        k = torch.randint(-100, 100, (batch, head, seq_len, headdim), dtype=torch.int8, device='cuda')
+        v = torch.randn(batch, head, seq_len, headdim, dtype=torch.float16, device='cuda')
+
+        q_scale = torch.randn(batch, head, (seq_len // 128), 1, dtype=torch.float16, device='cuda')
+        k_scale = torch.randn(batch, head, (seq_len // 64), 1, dtype=torch.float16, device='cuda')
+
+        for i in range(5): forward(q, k, v, q_scale, k_scale, output_dtype=torch.bfloat16)
+        torch.cuda.synchronize()
+        _, time = benchmark_forward(forward, q, k, v, q_scale, k_scale, output_dtype=torch.bfloat16, repeats=100, verbose=False, desc='Triton')
+        print(f'{seq_len} flops:{flops/time.mean*1e-12}')
+
+    # causal
+    print("is_causal: True")
+    for seq_len in sorted({1024, 2048, 4096, 8192, 16384, 32768}):
+        flops = 4 * head * batch * headdim * seq_len * seq_len // 2
+
+        q = torch.randint(-100, 100, (batch, head, seq_len, headdim), dtype=torch.int8, device='cuda')
+        k = torch.randint(-100, 100, (batch, head, seq_len, headdim), dtype=torch.int8, device='cuda')
+        v = torch.randn(batch, head, seq_len, headdim, dtype=torch.float16, device='cuda')
+
+        q_scale = torch.randn(batch, head, (seq_len // 128), 1, dtype=torch.float16, device='cuda')
+        k_scale = torch.randn(batch, head, (seq_len // 64), 1, dtype=torch.float16, device='cuda')
+
+        for i in range(5): forward_causal(q, k, v, q_scale, k_scale, output_dtype=torch.bfloat16)
+        torch.cuda.synchronize()
+        _, time = benchmark_forward(forward_causal, q, k, v, q_scale, k_scale, output_dtype=torch.bfloat16, repeats=100, verbose=False, desc='Triton')
+        print(f'{seq_len} flops:{flops/time.mean*1e-12}')
+
+# ============================================================
+# bench_qk_int8_pv_fp8_cuda (SM89 / RTX 4090)
+# ============================================================
+elif args.method == 'sage_int8_fp8_cuda':
+    import sageattention._qattn_sm89 as qattn
+
+    pv_accum = args.pv_accum_dtype or 'fp32+fp16'
+    assert pv_accum in ('fp32', 'fp32+fp32', 'fp32+fp16')
+
+    cuda_major, cuda_minor = get_cuda_version()
+    if (cuda_major, cuda_minor) < (12, 8) and pv_accum == 'fp32+fp16':
+        print("=============\n NOTE: cuda version < 12.8, not support pv_accum_dtype fp32+fp16.")
+        print(" Switch to 'fp32+fp32' automatically\n=============")
+        pv_accum = 'fp32+fp32'
+
+    WARP_Q = 32
+    WARP_K = 64
+
+    if pv_accum == 'fp32':
+        kernel = qattn.qk_int8_sv_f8_accum_f32_attn
+    elif pv_accum == 'fp32+fp32':
+        kernel = qattn.qk_int8_sv_f8_accum_f32_attn_inst_buf
+    elif pv_accum == 'fp32+fp16':
+        kernel = qattn.qk_int8_sv_f8_accum_f16_attn_inst_buf
+
+    _qk_quant_gran = 3 if args.quant_gran == 'per_thread' else 2
+
+    print(f"CUDA QK Int8 PV FP8 Benchmark (SM89)")
+    print(f"batch: {batch}, head: {head}, headdim: {headdim}, pv_accum_dtype: {pv_accum}")
+
+    for is_causal in [False, True]:
+        _is_causal = 1 if is_causal else 0
+        print(f"is_causal: {is_causal}")
+        for seq_len in sorted({1024, 2048, 4096, 8192, 16384, 32768}):
+            flops = 4 * head * batch * headdim * seq_len * seq_len / (2 if is_causal else 1)
+
+            q = torch.randint(-95, 95, (batch, seq_len, head, headdim), dtype=torch.int8, device="cuda")
+            k = torch.randint(-95, 95, (batch, seq_len, head, headdim), dtype=torch.int8, device="cuda")
+            o = torch.empty(batch, seq_len, head, headdim, dtype=torch.float16, device="cuda")
+
+            vm = torch.randn(batch, head, headdim, dtype=torch.float, device="cuda")
+            v_scale = torch.randn(batch, head, headdim, dtype=torch.float, device="cuda")
+
+            if args.quant_gran == 'per_warp':
+                q_scale = torch.randn(batch, head, seq_len // WARP_Q, dtype=torch.float, device="cuda")
+                k_scale = torch.randn(batch, head, seq_len // WARP_K, dtype=torch.float, device="cuda")
+            elif args.quant_gran == 'per_thread':
+                q_scale = torch.randn(batch, head, seq_len // WARP_Q * 8, dtype=torch.float, device="cuda")
+                k_scale = torch.randn(batch, head, seq_len // WARP_K * 4, dtype=torch.float, device="cuda")
+
+            v = torch.randn(batch, headdim, head, seq_len, dtype=torch.float16, device="cuda").to(torch.float8_e4m3fn)
+            sm_scale = 1 / (headdim ** 0.5)
+            for i in range(5): kernel(q, k, v, o, q_scale, k_scale, 0, _is_causal, _qk_quant_gran, sm_scale, 0)
+            torch.cuda.synchronize()
+            _, time = benchmark_forward(kernel, q, k, v, o, q_scale, k_scale, 0, _is_causal, _qk_quant_gran, sm_scale, 0, repeats=100, verbose=False, desc='Triton')
+            print(f'{seq_len} flops:{flops/time.mean*1e-12}')
+
+# ============================================================
+# bench_qk_int8_pv_fp8_cuda_sm90 (H100)
+# ============================================================
+elif args.method == 'sage_int8_fp8_cuda_sm90':
+    import sageattention._qattn_sm90 as qattn
+
+    pv_accum = args.pv_accum_dtype or 'fp32+fp32'
+    assert pv_accum == 'fp32+fp32', "pure fp32 accumulator is not supported for now"
+
+    WARP_Q = 32
+    WARP_K = 64
+
+    kernel = qattn.qk_int8_sv_f8_accum_f32_attn_inst_buf
+
+    _qk_quant_gran = 3 if args.quant_gran == 'per_thread' else 2
+
+    print(f"CUDA QK Int8 PV FP8 SM90 Benchmark")
+    print(f"batch: {batch}, head: {head}, headdim: {headdim}")
+
+    for is_causal in [False, True]:
+        _is_causal = 1 if is_causal else 0
+        print(f"is_causal: {is_causal}")
+        for seq_len in sorted({1024, 2048, 4096, 8192, 16384, 32768}):
+            flops = 4 * head * batch * headdim * seq_len * seq_len / (2 if is_causal else 1)
+
+            q = torch.randint(-95, 95, (batch, head, seq_len, headdim), dtype=torch.int8, device="cuda")
+            k = torch.randint(-95, 95, (batch, head, seq_len, headdim), dtype=torch.int8, device="cuda")
+            o = torch.empty(batch, head, seq_len, headdim, dtype=torch.float16, device="cuda")
+
+            v_scale = torch.randn(batch, head, headdim, dtype=torch.float, device="cuda")
+
+            if args.quant_gran == 'per_warp':
+                q_scale = torch.randn(batch, head, seq_len // 64 * 4, dtype=torch.float, device="cuda")
+                k_scale = torch.randn(batch, head, seq_len // 128, dtype=torch.float, device="cuda")
+            elif args.quant_gran == 'per_thread':
+                q_scale = torch.randn(batch, head, seq_len // 64 * 4 * 8, dtype=torch.float, device="cuda")
+                k_scale = torch.randn(batch, head, seq_len // 128 * 4, dtype=torch.float, device="cuda")
+
+            v = torch.randn(batch, head, headdim, seq_len, dtype=torch.float16, device="cuda").to(torch.float8_e4m3fn)
+            sm_scale = 1 / (headdim ** 0.5)
+            for i in range(5): kernel(q, k, v, o, q_scale, k_scale, 1, _is_causal, _qk_quant_gran, sm_scale, 0)
+            torch.cuda.synchronize()
+            _, time = benchmark_forward(kernel, q, k, v, o, q_scale, k_scale, 1, _is_causal, _qk_quant_gran, sm_scale, 0, repeats=100, verbose=False, desc='Triton')
+            print(f'{seq_len} flops:{flops/time.mean*1e-12}')

From ebf07716b5a08fca077ee7df9e03b87c73dd50cb Mon Sep 17 00:00:00 2001
From: lishunyang <lishunyang12@163.com>
Date: Thu, 9 Apr 2026 22:49:20 +0800
Subject: [PATCH 43/45] Inline benchmark_forward to remove flash_attn
 dependency

Signed-off-by: lishunyang <lishunyang12@163.com>
---
 benchmarks/diffusion/bench_attn_kernel.py | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/benchmarks/diffusion/bench_attn_kernel.py b/benchmarks/diffusion/bench_attn_kernel.py
index 936d403da99..458cf8bc6ed 100644
--- a/benchmarks/diffusion/bench_attn_kernel.py
+++ b/benchmarks/diffusion/bench_attn_kernel.py
@@ -25,7 +25,22 @@
 import subprocess
 
 import torch
-from flash_attn.utils.benchmark import benchmark_forward
+import torch.utils.benchmark as benchmark
+
+
+def benchmark_forward(fn, *inputs, repeats=100, desc="", verbose=False, **kwinputs):
+    """Reimplemented from flash_attn.utils.benchmark.benchmark_forward
+    so we don't need flash_attn installed just for the timer."""
+    t = benchmark.Timer(
+        stmt="fn(*inputs, **kwinputs)",
+        globals={"fn": fn, "inputs": inputs, "kwinputs": kwinputs},
+        num_threads=torch.get_num_threads(),
+    )
+    m = t.timeit(repeats)
+    if verbose:
+        print(desc, "- Forward pass")
+        print(m)
+    return t, m
 
 
 def get_cuda_version():

From b3fe3f6158d35cf9d30533ccfd6654545c27067a Mon Sep 17 00:00:00 2001
From: lishunyang <lishunyang12@163.com>
Date: Thu, 9 Apr 2026 22:51:11 +0800
Subject: [PATCH 44/45] Add sageattn method, --dtype flag, fix FA3 BF16 support

Signed-off-by: lishunyang <lishunyang12@163.com>
---
 benchmarks/diffusion/bench_attn_kernel.py | 45 ++++++++++++++++++-----
 1 file changed, 35 insertions(+), 10 deletions(-)

diff --git a/benchmarks/diffusion/bench_attn_kernel.py b/benchmarks/diffusion/bench_attn_kernel.py
index 458cf8bc6ed..8d3ec502fad 100644
--- a/benchmarks/diffusion/bench_attn_kernel.py
+++ b/benchmarks/diffusion/bench_attn_kernel.py
@@ -11,7 +11,8 @@
   bench_qk_int8_pv_fp8_cuda_sm90.py -> --method sage_int8_fp8_cuda_sm90 (H100)
 
 Usage:
-  python bench_attn_kernel.py --method fa3
+  python bench_attn_kernel.py --method fa3 --dtype bfloat16
+  python bench_attn_kernel.py --method sageattn --dtype bfloat16
   python bench_attn_kernel.py --method fa2
   python bench_attn_kernel.py --method torch
   python bench_attn_kernel.py --method sage_int8_fp16_cuda
@@ -57,7 +58,7 @@ def get_cuda_version():
 
 parser = argparse.ArgumentParser(description='Attention Kernel Benchmark (SageAttention official style)')
 parser.add_argument('--method', type=str, default='fa3',
-                    choices=['fa2', 'torch', 'xformers', 'fa3',
+                    choices=['fa2', 'torch', 'xformers', 'fa3', 'sageattn',
                              'sage_int8_fp16_cuda', 'sage_int8_fp16_triton',
                              'sage_int8_fp8_cuda', 'sage_int8_fp8_cuda_sm90'])
 parser.add_argument('--batch_size', type=int, default=4, help='Batch size')
@@ -67,11 +68,14 @@ def get_cuda_version():
                     help='Quantization granularity (sage kernels only)')
 parser.add_argument('--pv_accum_dtype', type=str, default=None,
                     help='PV accumulation dtype (sage kernels only)')
+parser.add_argument('--dtype', type=str, default='float16', choices=['float16', 'bfloat16'],
+                    help='Data type for FA3/sageattn/baseline (default: float16)')
 args = parser.parse_args()
 
 head = args.num_heads
 batch = args.batch_size
 headdim = args.head_dim
+dtype = getattr(torch, args.dtype)
 
 # ============================================================
 # bench_baseline: fa2 / torch / xformers
@@ -84,15 +88,15 @@ def get_cuda_version():
     torch.backends.cuda.enable_mem_efficient_sdp(args.method == 'xformers')
 
     print(f"Baseline: {args.method}")
-    print(f"batch: {batch}, head: {head}, headdim: {headdim}")
+    print(f"batch: {batch}, head: {head}, headdim: {headdim}, dtype: {args.dtype}")
 
     for is_causal in [False, True]:
         print(f"is_causal: {is_causal}")
         for seq_len in sorted({1024, 2048, 4096, 8192, 16384, 32768}):
             flops = 4 * head * batch * headdim * seq_len * seq_len // (2 if is_causal else 1)
-            q = torch.randn(batch, head, seq_len, headdim, dtype=torch.float16, device="cuda")
-            k = torch.randn(batch, head, seq_len, headdim, dtype=torch.float16, device="cuda")
-            v = torch.randn(batch, head, seq_len, headdim, dtype=torch.float16, device="cuda")
+            q = torch.randn(batch, head, seq_len, headdim, dtype=dtype, device="cuda")
+            k = torch.randn(batch, head, seq_len, headdim, dtype=dtype, device="cuda")
+            v = torch.randn(batch, head, seq_len, headdim, dtype=dtype, device="cuda")
             for i in range(5): sdpa(q, k, v, is_causal=is_causal)
             torch.cuda.synchronize()
             _, time = benchmark_forward(sdpa, q, k, v, is_causal=is_causal, repeats=100, verbose=False, desc='Triton')
@@ -118,20 +122,41 @@ def get_cuda_version():
         raise ImportError("Neither fa3_fwd_interface nor flash_attn_interface found. Install FA3.")
 
     print(f"FlashAttention3 Benchmark (source: {fa3_source})")
-    print(f"batch: {batch}, head: {head}, headdim: {headdim}")
+    print(f"batch: {batch}, head: {head}, headdim: {headdim}, dtype: {args.dtype}")
 
     for is_causal in [False, True]:
         print(f"is_causal: {is_causal}")
         for seq_len in sorted({1024, 2048, 4096, 8192, 16384, 32768}):
             flops = 4 * head * batch * headdim * seq_len * seq_len // (2 if is_causal else 1)
-            q = torch.randn(batch, seq_len, head, headdim, dtype=torch.float16, device="cuda")
-            k = torch.randn(batch, seq_len, head, headdim, dtype=torch.float16, device="cuda")
-            v = torch.randn(batch, seq_len, head, headdim, dtype=torch.float16, device="cuda")
+            q = torch.randn(batch, seq_len, head, headdim, dtype=dtype, device="cuda")
+            k = torch.randn(batch, seq_len, head, headdim, dtype=dtype, device="cuda")
+            v = torch.randn(batch, seq_len, head, headdim, dtype=dtype, device="cuda")
             for i in range(5): flash_attn_func_v3(q, k, v, causal=is_causal)
             torch.cuda.synchronize()
             _, time = benchmark_forward(flash_attn_func_v3, q, k, v, causal=is_causal, repeats=100, verbose=False, desc='Triton')
             print(f'{seq_len} flops:{flops/time.mean*1e-12}')
 
+# ============================================================
+# bench sageattn high-level API (what vllm-omni actually calls)
+# ============================================================
+elif args.method == 'sageattn':
+    from sageattention import sageattn
+
+    print(f"SageAttention (sageattn high-level API) Benchmark")
+    print(f"batch: {batch}, head: {head}, headdim: {headdim}, dtype: {args.dtype}")
+
+    for is_causal in [False, True]:
+        print(f"is_causal: {is_causal}")
+        for seq_len in sorted({1024, 2048, 4096, 8192, 16384, 32768}):
+            flops = 4 * head * batch * headdim * seq_len * seq_len // (2 if is_causal else 1)
+            q = torch.randn(batch, seq_len, head, headdim, dtype=dtype, device="cuda")
+            k = torch.randn(batch, seq_len, head, headdim, dtype=dtype, device="cuda")
+            v = torch.randn(batch, seq_len, head, headdim, dtype=dtype, device="cuda")
+            for i in range(5): sageattn(q, k, v, tensor_layout="NHD", is_causal=is_causal)
+            torch.cuda.synchronize()
+            _, time = benchmark_forward(sageattn, q, k, v, tensor_layout="NHD", is_causal=is_causal, repeats=100, verbose=False, desc='SageAttn')
+            print(f'{seq_len} flops:{flops/time.mean*1e-12}')
+
 # ============================================================
 # bench_qk_int8_pv_fp16_cuda
 # ============================================================

From 8d5bca84bd1a8ba74845f74684d2472a976c745b Mon Sep 17 00:00:00 2001
From: lishunyang <lishunyang12@163.com>
Date: Thu, 9 Apr 2026 23:48:25 +0800
Subject: [PATCH 45/45] Default to SageAttention when available on CUDA

Priority: SageAttn > FlashAttn > SDPA.
SageAttn2 v2.2.0 with SM90 FP8 kernels is 8% faster than FA3
on H100 for HunyuanVideo 1.5 (4.00 vs 4.35 s/it).

Signed-off-by: lishunyang <lishunyang12@163.com>
---
 vllm_omni/platforms/cuda/platform.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/vllm_omni/platforms/cuda/platform.py b/vllm_omni/platforms/cuda/platform.py
index 6bf740a0188..de1454632a4 100644
--- a/vllm_omni/platforms/cuda/platform.py
+++ b/vllm_omni/platforms/cuda/platform.py
@@ -73,6 +73,17 @@ def get_diffusion_attn_backend_cls(
             logger.info("Using diffusion attention backend '%s'", backend_upper)
             return backend.get_path()
 
+        # Prefer SageAttention (INT8 QK + FP8 PV on Hopper) when available
+        try:
+            import sageattention  # noqa: F401
+            sage_available = True
+        except ImportError:
+            sage_available = False
+
+        if sage_available:
+            logger.info("Defaulting to diffusion attention backend SAGE_ATTN")
+            return DiffusionAttentionBackendEnum.SAGE_ATTN.get_path()
+
         if flash_attn_supported:
             logger.info("Defaulting to diffusion attention backend FLASH_ATTN")
             return DiffusionAttentionBackendEnum.FLASH_ATTN.get_path()