Fix SDPA attention precision issue in Qwen2.5-VL #37363

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged

zucchini-nlp merged 3 commits into huggingface:main from JJJYmmm:main

Jul 9, 2025

src/transformers/models/glm4v/modeling_glm4v.py

-Original file line number
+Diff line change
@@ -296,7 +296,6 @@ def forward(
             cu_seqlens: torch.Tensor,
             rotary_pos_emb: Optional[torch.Tensor] = None,
             position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
-            attention_mask: Optional[torch.Tensor] = None,
             **kwargs,
         ) -> torch.Tensor:
             seq_length = hidden_states.shape[0]
@@ -320,27 +319,51 @@ def forward(
             query_states = query_states.transpose(0, 1).unsqueeze(0)
             key_states = key_states.transpose(0, 1).unsqueeze(0)
             value_states = value_states.transpose(0, 1).unsqueeze(0)
-            max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
             attention_interface: Callable = eager_attention_forward
             if self.config._attn_implementation != "eager":
                 attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
-            attn_output, _ = attention_interface(
-                self,
-                query_states,
-                key_states,
-                value_states,
-                attention_mask=attention_mask,
-                dropout=0.0 if not self.training else self.attention_dropout,
-                scaling=self.scaling,
-                cu_seq_lens_q=cu_seqlens,  # pass cu seq lens for FA2
-                cu_seq_lens_k=cu_seqlens,
-                max_length_q=max_seqlen,
-                max_length_k=max_seqlen,
-                is_causal=False,
-                **kwargs,
-            )
+            if self.config._attn_implementation == "flash_attention_2":
+                # Flash Attention 2: Use cu_seqlens for variable length attention
+                max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
+                attn_output, _ = attention_interface(
+                    self,
+                    query_states,
+                    key_states,
+                    value_states,
+                    attention_mask=None,
+                    scaling=self.scaling,
+                    dropout=0.0 if not self.training else self.attention_dropout,
+                    cu_seq_lens_q=cu_seqlens,
+                    cu_seq_lens_k=cu_seqlens,
+                    max_length_q=max_seqlen,
+                    max_length_k=max_seqlen,
+                    is_causal=False,
+                    **kwargs,
+                )
+            else:
+                # Other implementations: Process each chunk separately
+                lengths = cu_seqlens[1:] - cu_seqlens[:-1]
+                splits = [
+                    torch.split(tensor, lengths.tolist(), dim=2) for tensor in (query_states, key_states, value_states)
+                ]
+                attn_outputs = [
+                    attention_interface(
+                        self,
+                        q,
+                        k,
+                        v,
+                        attention_mask=None,
+                        scaling=self.scaling,
+                        dropout=0.0 if not self.training else self.attention_dropout,
+                        is_causal=False,
+                        **kwargs,
+                    )[0]
+                    for q, k, v in zip(*splits)
+                ]
+                attn_output = torch.cat(attn_outputs, dim=1)
             attn_output = attn_output.reshape(seq_length, -1).contiguous()
             attn_output = self.proj(attn_output)
@@ -361,15 +384,13 @@ def forward(
             cu_seqlens: torch.Tensor,
             rotary_pos_emb: Optional[torch.Tensor] = None,
             position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
-            attention_mask: Optional[torch.Tensor] = None,
             **kwargs,
         ) -> torch.Tensor:
             hidden_states = hidden_states + self.attn(
                 self.norm1(hidden_states),
                 cu_seqlens=cu_seqlens,
                 rotary_pos_emb=rotary_pos_emb,
                 position_embeddings=position_embeddings,
-                attention_mask=attention_mask,
                 **kwargs,
             )
             hidden_states = hidden_states + self.mlp(self.norm2(hidden_states))
@@ -467,25 +488,6 @@ def rot_pos_emb(self, grid_thw):
             rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
             return rotary_pos_emb, pos_ids
-        def _prepare_attention_mask(self, inputs_tensor: torch.Tensor, cu_seqlens: torch.Tensor) -> torch.Tensor:
-            # Flash Attention 2 doesn't need a 4D mask and relies on `cu_seqlens/max_seqlen`
-            # NOTE: the created attention mask only approximates the ragged FA2 attention by
-            # allowing bidirectional attention within `cu_seqlens` blocks, and not attending between
-            # blocks. Though it will not be a 100% match for FA2's `varlen` path
-            if self.config._attn_implementation == "flash_attention_2":
-                return None
-            seq_length = inputs_tensor.shape[0]
-            attention_mask = torch.full(
-                [1, 1, seq_length, seq_length],
-                torch.finfo(inputs_tensor.dtype).min,
-                device=inputs_tensor.device,
-                dtype=inputs_tensor.dtype,
-            )
-            for i in range(1, len(cu_seqlens)):
-                attention_mask[..., cu_seqlens[i - 1] : cu_seqlens[i], cu_seqlens[i - 1] : cu_seqlens[i]] = 0
-            return attention_mask
         def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor) -> torch.Tensor:
             """
             Args:
@@ Expand Down Expand Up @@
             cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
             seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
             hidden_states = self.embeddings(hidden_states, seqlens, grid_thw, image_type_ids[:, 0], image_type_ids[:, 1])
-            attention_mask = self._prepare_attention_mask(hidden_states, cu_seqlens=cu_seqlens)
             for blk in self.blocks:
                 hidden_states = blk(
                     hidden_states,
                     cu_seqlens=cu_seqlens,
                     position_embeddings=position_embeddings,
-                    attention_mask=attention_mask,
                 )
             hidden_states = self.post_layernorm(hidden_states)
@@ Expand Down @@

src/transformers/models/glm4v/modular_glm4v.py

-Original file line number
+Diff line change
@@ -603,25 +603,6 @@ def rot_pos_emb(self, grid_thw):
             rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
             return rotary_pos_emb, pos_ids
-        def _prepare_attention_mask(self, inputs_tensor: torch.Tensor, cu_seqlens: torch.Tensor) -> torch.Tensor:
-            # Flash Attention 2 doesn't need a 4D mask and relies on `cu_seqlens/max_seqlen`
-            # NOTE: the created attention mask only approximates the ragged FA2 attention by
-            # allowing bidirectional attention within `cu_seqlens` blocks, and not attending between
-            # blocks. Though it will not be a 100% match for FA2's `varlen` path
-            if self.config._attn_implementation == "flash_attention_2":
-                return None
-            seq_length = inputs_tensor.shape[0]
-            attention_mask = torch.full(
-                [1, 1, seq_length, seq_length],
-                torch.finfo(inputs_tensor.dtype).min,
-                device=inputs_tensor.device,
-                dtype=inputs_tensor.dtype,
-            )
-            for i in range(1, len(cu_seqlens)):
-                attention_mask[..., cu_seqlens[i - 1] : cu_seqlens[i], cu_seqlens[i - 1] : cu_seqlens[i]] = 0
-            return attention_mask
         def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor) -> torch.Tensor:
             """
             Args:
@@ Expand Down Expand Up @@
             cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
             seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
             hidden_states = self.embeddings(hidden_states, seqlens, grid_thw, image_type_ids[:, 0], image_type_ids[:, 1])
-            attention_mask = self._prepare_attention_mask(hidden_states, cu_seqlens=cu_seqlens)
             for blk in self.blocks:
                 hidden_states = blk(
                     hidden_states,
                     cu_seqlens=cu_seqlens,
                     position_embeddings=position_embeddings,
-                    attention_mask=attention_mask,
                 )
             hidden_states = self.post_layernorm(hidden_states)
@@ Expand Down @@

src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py

-Original file line number
+Diff line change
@@ -956,7 +956,6 @@ def forward(
             hidden_states: torch.Tensor,
             cu_seqlens: torch.Tensor,
             rotary_pos_emb: Optional[torch.Tensor] = None,
-            attention_mask: Optional[torch.Tensor] = None,
             **kwargs,
         ) -> torch.Tensor:
             seq_length = hidden_states.shape[0]
@@ -969,27 +968,51 @@ def forward(
             query_states = query_states.transpose(0, 1).unsqueeze(0)
             key_states = key_states.transpose(0, 1).unsqueeze(0)
             value_states = value_states.transpose(0, 1).unsqueeze(0)
-            max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
             attention_interface: Callable = eager_attention_forward
             if self.config._attn_implementation != "eager":
                 attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
-            attn_output, _ = attention_interface(
-                self,
-                query_states,
-                key_states,
-                value_states,
-                attention_mask=attention_mask,
-                dropout=0.0 if not self.training else self.attention_dropout,
-                scaling=self.scaling,
-                cu_seq_lens_q=cu_seqlens,  # pass cu seq lens for FA2
-                cu_seq_lens_k=cu_seqlens,
-                max_length_q=max_seqlen,
-                max_length_k=max_seqlen,
-                is_causal=False,
-                **kwargs,
-            )
+            if self.config._attn_implementation == "flash_attention_2":
+                # Flash Attention 2: Use cu_seqlens for variable length attention
+                max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
+                attn_output, _ = attention_interface(
+                    self,
+                    query_states,
+                    key_states,
+                    value_states,
+                    attention_mask=None,
+                    scaling=self.scaling,
+                    dropout=0.0 if not self.training else self.attention_dropout,
+                    cu_seq_lens_q=cu_seqlens,
+                    cu_seq_lens_k=cu_seqlens,
+                    max_length_q=max_seqlen,
+                    max_length_k=max_seqlen,
+                    is_causal=False,
+                    **kwargs,
+                )
+            else:
+                # Other implementations: Process each chunk separately
+                lengths = cu_seqlens[1:] - cu_seqlens[:-1]
+                splits = [
+                    torch.split(tensor, lengths.tolist(), dim=2) for tensor in (query_states, key_states, value_states)
+                ]
+                attn_outputs = [
+                    attention_interface(
+                        self,
+                        q,
+                        k,
+                        v,
+                        attention_mask=None,
+                        scaling=self.scaling,
+                        dropout=0.0 if not self.training else self.attention_dropout,
+                        is_causal=False,
+                        **kwargs,
+                    )[0]
+                    for q, k, v in zip(*splits)
+                ]
+                attn_output = torch.cat(attn_outputs, dim=1)
             attn_output = attn_output.reshape(seq_length, -1).contiguous()
             attn_output = self.proj(attn_output)
@@ -1023,14 +1046,12 @@ def forward(
             hidden_states: torch.Tensor,
             cu_seqlens: torch.Tensor,
             rotary_pos_emb: Optional[torch.Tensor] = None,
-            attention_mask: Optional[torch.Tensor] = None,
             **kwargs,
         ) -> torch.Tensor:
             hidden_states = hidden_states + self.attn(
                 self.norm1(hidden_states),
                 cu_seqlens=cu_seqlens,
                 rotary_pos_emb=rotary_pos_emb,
-                attention_mask=attention_mask,
                 **kwargs,
             )
             hidden_states = hidden_states + self.mlp(self.norm2(hidden_states))
@@ -1190,25 +1211,6 @@ def get_window_index(self, grid_thw):
             return window_index, cu_window_seqlens
-        def _prepare_attention_mask(self, inputs_tensor: torch.Tensor, cu_seqlens: torch.Tensor) -> torch.Tensor:
-            # Flash Attention 2 doesn't need a 4D mask and relies on `cu_seqlens/max_seqlen`
-            # NOTE: the created attention mask only approximates the ragged FA2 attention by
-            # allowing bidirectional attention within `cu_seqlens` blocks, and not attending between
-            # blocks. Though it will not be a 100% match for FA2's `varlen` path
-            if self.config._attn_implementation == "flash_attention_2":
-                return None
-            seq_length = inputs_tensor.shape[0]
-            attention_mask = torch.full(
-                [1, 1, seq_length, seq_length],
-                torch.finfo(inputs_tensor.dtype).min,
-                device=inputs_tensor.device,
-                dtype=inputs_tensor.dtype,
-            )
-            for i in range(1, len(cu_seqlens)):
-                attention_mask[..., cu_seqlens[i - 1] : cu_seqlens[i], cu_seqlens[i - 1] : cu_seqlens[i]] = 0
-            return attention_mask
         def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs) -> torch.Tensor:
             """
             Args:
@@ Expand Down Expand Up @@
                 else:
                     cu_seqlens_now = cu_window_seqlens
-                attention_mask = self._prepare_attention_mask(hidden_states, cu_seqlens_now)
                 hidden_states = blk(
                     hidden_states,
                     cu_seqlens=cu_seqlens_now,
                     rotary_pos_emb=rotary_pos_emb,
-                    attention_mask=attention_mask,
                     **kwargs,
                 )
             hidden_states = self.merger(hidden_states)
@@ Expand Down @@

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Fix SDPA attention precision issue in Qwen2.5-VL #37363

Uh oh!

Diff view

Diff view

There are no files selected for viewing

Uh oh!