From e4a8aa761e6e5f0636f7c166f396535e53e410f0 Mon Sep 17 00:00:00 2001
From: Radoslaw Smyrek <radoslawx.smyrek@intel.com>
Date: Fri, 22 May 2026 12:22:57 +0300
Subject: [PATCH] Revert "Skip materialised causal attn_bias on FSDPA for
 non-GDN hybrid models (#1413)"

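This reverts commit 808dbfaffad15ad0acbd0c94f4cb081a68b1f68b.

It removes the `is_non_gdn_hybrid` flag and the early returns in
`set_attn_bias` / `_set_attn_bias` that skipped building `attn_bias` for
non-GDN hybrid models on the FusedSDPA prefill path, so those models no
longer take a separate short-circuit there.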

Signed-off-by: Radoslaw Smyrek <radoslawx.smyrek@intel.com>
---
 vllm_gaudi/v1/worker/hpu_model_runner.py | 46 ------------------------
 1 file changed, 46 deletions(-)

diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py
index bf5b50293b..c51db6a3ca 100644
--- a/vllm_gaudi/v1/worker/hpu_model_runner.py
+++ b/vllm_gaudi/v1/worker/hpu_model_runner.py
@@ -1136,12 +1136,6 @@ def __init__(
                                              and (getattr(hf_text_config, "mamba_chunk_size", None) is not None
                                                   or getattr(hf_text_config, "chunk_size", None) is not None))
 
-        # Non-GDN hybrid: at least one mamba/linear-style layer and zero GDN
-        # (gdn_attention / linear_attention) layers. Used to gate optimizations
-        # that have only been validated on non-GDN hybrid topologies
-        # (e.g. Granite-4 Mamba2+Transformer).
-        self.is_non_gdn_hybrid = (self.num_mamba_like_layers > 0 and self.num_gdn == 0)
-
         # For HPU GDN, use configured chunk size when explicitly provided;
         # otherwise default to 128 to match bucket alignment.
         if self.num_mamba_like_layers > 0:
@@ -3896,21 +3890,6 @@ def set_attn_bias(self, attn_metadata, batch_size, seq_len, device, dtype):
                 or not attn_metadata.is_prompt):
             return attn_metadata
 
-        # Extended FSDPA-native causal short-circuit for non-GDN hybrid models
-        # (e.g. Granite-4 Mamba2+Transformer). FusedSDPA can encode a purely
-        # causal mask natively via is_causal=True + valid_seq_lengths, including
-        # chunked prefill where block_list is non-None. Skipping the
-        # materialised [bs, 1, q_len, total_kv_len] attn_bias avoids a large
-        # add_bf16 on the attention critical path (significant at long
-        # context). Conservative scope: only non-GDN hybrid models; GDN /
-        # pure-transformer / other topologies keep the materialised bias path
-        # until validated.
-        if (self.prefill_use_fusedsdpa and self.is_causal and not self.is_pooling_model
-                and not getattr(self, 'sliding_window', None)
-                and not getattr(self, 'model_has_chunked_attention', False)
-                and getattr(self, 'alibi_slopes', None) is None and self.is_non_gdn_hybrid):
-            return attn_metadata
-
         if attn_metadata.attn_bias is not None:
             return attn_metadata
 
@@ -6753,17 +6732,6 @@ def __init__(
         self.interleaved_sliding_window = (is_interleaved(vllm_config.model_config.hf_text_config)
                                            and self.sliding_window)
 
-        # Detect non-GDN hybrid topologies (e.g. Granite-4 Mamba2+Transformer).
-        # Used to gate the FSDPA-native causal short-circuit in _set_attn_bias.
-        # Mirrors the runner's num_mamba_like_layers / num_gdn computation
-        # (HPUModelRunner.__init__) so the same set of models is targeted.
-        get_num_layers = vllm_config.model_config.get_num_layers_by_block_type
-        parallel_config = vllm_config.parallel_config
-        num_mamba_like = sum(
-            get_num_layers(parallel_config, bt) for bt in ("mamba", "gdn_attention", "linear_attention"))
-        num_gdn = sum(get_num_layers(parallel_config, bt) for bt in ("gdn_attention", "linear_attention"))
-        self.is_non_gdn_hybrid = (num_mamba_like > 0 and num_gdn == 0)
-
         if self.interleaved_sliding_window:
             self.use_window_sdpa = with_default(get_config().PT_HPU_SDPA_QKV_SLICE_MODE_FWD, False)
             #os.getenv("PT_HPU_SDPA_QKV_SLICE_MODE_FWD", "false").strip().lower() in ("1", "true")
@@ -6796,20 +6764,6 @@ def _set_attn_bias(self, attn_metadata: HPUAttentionMetadataV1, batch_size: int,
                 or not attn_metadata.is_prompt):
             return attn_metadata
 
-        # Extended FSDPA-native causal short-circuit for non-GDN hybrid models
-        # (e.g. Granite-4 Mamba2+Transformer). FusedSDPA handles a purely
-        # causal mask natively (is_causal=True + valid_seq_lengths). Skip
-        # materialising a [bs, 1, q_len, total_kv_len] attn_bias even during
-        # chunked prefill (block_list is non-None) for these topologies; this
-        # removes a sizable add_bf16 from the attention critical path during
-        # long-context chunked prefill. interleaved_sliding_window and
-        # chunked-attention bias paths (window_attn_bias / chunked_attn_bias)
-        # are populated later in process_metadata and used by hpu_attn
-        # instead. Conservative scope: only non-GDN hybrid models; all other
-        # topologies retain the original behaviour.
-        if (self.prefill_use_fusedsdpa and not self.interleaved_sliding_window and self.is_non_gdn_hybrid):
-            return attn_metadata
-
         if attn_metadata.attn_bias is not None:
             return attn_metadata