vllm-project · czhu15 · Apr 8, 2026 · Apr 2, 2026 · Apr 2, 2026 · Apr 1, 2026
@@ -454,6 +454,13 @@ def __init__(
         HPUFusedSDPA = kernels.fsdpa()
         self.fused_scaled_dot_product_attention = None if HPUFusedSDPA is None \
             else ModuleFusedSDPA(HPUFusedSDPA)
+        try:
+            from habana_frameworks.torch.hpex.kernels import fp8_fused_sdpa
+            if self.enable_fp8_attn:
+                self.fused_scaled_dot_product_attention = ModuleFP8FusedSDPA(fp8_fused_sdpa)
+        except ImportError:
+            pass
+
         self.prefill_impl = get_config().prompt_attn_impl
         self.use_contiguous_pa = get_config().use_contiguous_pa
         self.use_merged_prefill = get_config().merged_prefill

@@ -349,12 +349,9 @@ def _naive_prompt_attention(query: torch.Tensor,
             htcore.mark_step()
         attn_weights.add_(position_bias)
     if attn_bias is not None:
-        if attn_bias.dtype == torch.bool:
-            attn_weights = attn_weights.masked_fill(~attn_bias, float("-inf"))
-        else:
-            if attn_weights.dtype != attn_bias.dtype:
-                attn_bias = attn_bias.to(dtype=attn_weights.dtype)
-            attn_weights.add_(attn_bias)
+        if attn_weights.dtype != attn_bias.dtype:
+            attn_bias = attn_bias.to(dtype=attn_weights.dtype)
+        attn_weights.add_(attn_bias)
     if sinks is not None:
         sink = sinks.reshape(1, -1, 1, 1).expand(query.shape[0], -1, query.shape[-2], -1)
         if query_heads != kv_heads: