From e4a8aa761e6e5f0636f7c166f396535e53e410f0 Mon Sep 17 00:00:00 2001 From: Radoslaw Smyrek Date: Fri, 22 May 2026 12:22:57 +0300 Subject: [PATCH] Revert "Skip materialised causal attn_bias on FSDPA for non-GDN hybrid models (#1413)" This reverts commit 808dbfaffad15ad0acbd0c94f4cb081a68b1f68b. Signed-off-by: Radoslaw Smyrek --- vllm_gaudi/v1/worker/hpu_model_runner.py | 46 ------------------------ 1 file changed, 46 deletions(-) diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py index bf5b50293b..c51db6a3ca 100644 --- a/vllm_gaudi/v1/worker/hpu_model_runner.py +++ b/vllm_gaudi/v1/worker/hpu_model_runner.py @@ -1136,12 +1136,6 @@ def __init__( and (getattr(hf_text_config, "mamba_chunk_size", None) is not None or getattr(hf_text_config, "chunk_size", None) is not None)) - # Non-GDN hybrid: at least one mamba/linear-style layer and zero GDN - # (gdn_attention / linear_attention) layers. Used to gate optimizations - # that have only been validated on non-GDN hybrid topologies - # (e.g. Granite-4 Mamba2+Transformer). - self.is_non_gdn_hybrid = (self.num_mamba_like_layers > 0 and self.num_gdn == 0) - # For HPU GDN, use configured chunk size when explicitly provided; # otherwise default to 128 to match bucket alignment. if self.num_mamba_like_layers > 0: @@ -3896,21 +3890,6 @@ def set_attn_bias(self, attn_metadata, batch_size, seq_len, device, dtype): or not attn_metadata.is_prompt): return attn_metadata - # Extended FSDPA-native causal short-circuit for non-GDN hybrid models - # (e.g. Granite-4 Mamba2+Transformer). FusedSDPA can encode a purely - # causal mask natively via is_causal=True + valid_seq_lengths, including - # chunked prefill where block_list is non-None. Skipping the - # materialised [bs, 1, q_len, total_kv_len] attn_bias avoids a large - # add_bf16 on the attention critical path (significant at long - # context). Conservative scope: only non-GDN hybrid models; GDN / - # pure-transformer / other topologies keep the materialised bias path - # until validated. - if (self.prefill_use_fusedsdpa and self.is_causal and not self.is_pooling_model - and not getattr(self, 'sliding_window', None) - and not getattr(self, 'model_has_chunked_attention', False) - and getattr(self, 'alibi_slopes', None) is None and self.is_non_gdn_hybrid): - return attn_metadata - if attn_metadata.attn_bias is not None: return attn_metadata @@ -6753,17 +6732,6 @@ def __init__( self.interleaved_sliding_window = (is_interleaved(vllm_config.model_config.hf_text_config) and self.sliding_window) - # Detect non-GDN hybrid topologies (e.g. Granite-4 Mamba2+Transformer). - # Used to gate the FSDPA-native causal short-circuit in _set_attn_bias. - # Mirrors the runner's num_mamba_like_layers / num_gdn computation - # (HPUModelRunner.__init__) so the same set of models is targeted. - get_num_layers = vllm_config.model_config.get_num_layers_by_block_type - parallel_config = vllm_config.parallel_config - num_mamba_like = sum( - get_num_layers(parallel_config, bt) for bt in ("mamba", "gdn_attention", "linear_attention")) - num_gdn = sum(get_num_layers(parallel_config, bt) for bt in ("gdn_attention", "linear_attention")) - self.is_non_gdn_hybrid = (num_mamba_like > 0 and num_gdn == 0) - if self.interleaved_sliding_window: self.use_window_sdpa = with_default(get_config().PT_HPU_SDPA_QKV_SLICE_MODE_FWD, False) #os.getenv("PT_HPU_SDPA_QKV_SLICE_MODE_FWD", "false").strip().lower() in ("1", "true") @@ -6796,20 +6764,6 @@ def _set_attn_bias(self, attn_metadata: HPUAttentionMetadataV1, batch_size: int, or not attn_metadata.is_prompt): return attn_metadata - # Extended FSDPA-native causal short-circuit for non-GDN hybrid models - # (e.g. Granite-4 Mamba2+Transformer). FusedSDPA handles a purely - # causal mask natively (is_causal=True + valid_seq_lengths). Skip - # materialising a [bs, 1, q_len, total_kv_len] attn_bias even during - # chunked prefill (block_list is non-None) for these topologies; this - # removes a sizable add_bf16 from the attention critical path during - # long-context chunked prefill. interleaved_sliding_window and - # chunked-attention bias paths (window_attn_bias / chunked_attn_bias) - # are populated later in process_metadata and used by hpu_attn - # instead. Conservative scope: only non-GDN hybrid models; all other - # topologies retain the original behaviour. - if (self.prefill_use_fusedsdpa and not self.interleaved_sliding_window and self.is_non_gdn_hybrid): - return attn_metadata - if attn_metadata.attn_bias is not None: return attn_metadata