
Commit 1b9cfda: add varlen attention for qwen 3 (#2084)

As title.

**Testing**
[Screenshot 2025-11-24 at 4 30 53 PM: https://github.com/user-attachments/assets/6b9a362d-de36-48b7-b465-d91ae24f4cbf]
Performance and loss are on par.
1 parent d0393b3 commit 1b9cfda

File tree: 2 files changed (+47, -3 lines)


torchtitan/models/qwen3/infra/parallelize.py
Lines changed: 1 addition & 0 deletions

```diff
@@ -43,6 +43,7 @@
     # used to compute the scaling factor for quantization.
     torch.ops.aten.max.default,
     torch._higher_order_ops.flex_attention,
+    torch.ops.torch_attn._varlen_attn,
 }
 
 
```
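For context, the op set extended above appears to be the save-list that torchtitan feeds to PyTorch's selective activation checkpointing (an assumption based on the surrounding comment; the rest of the file is not shown in this diff). A minimal sketch of how such a save-list is typically consumed, not torchtitan's actual wiring:

```python
# Minimal sketch, assuming the set above is a selective-activation-checkpointing
# (SAC) save-list. Ops in the list have their outputs saved; others are recomputed.
import torch
from torch.utils.checkpoint import CheckpointPolicy, create_selective_checkpoint_contexts

_op_sac_save_list = {
    torch.ops.aten.max.default,
    torch._higher_order_ops.flex_attention,
    # torch.ops.torch_attn._varlen_attn,  # the newly allow-listed varlen attention op
}

def _policy(ctx, op, *args, **kwargs):
    # Save outputs of allow-listed ops; let everything else be recomputed in backward.
    if op in _op_sac_save_list:
        return CheckpointPolicy.MUST_SAVE
    return CheckpointPolicy.PREFER_RECOMPUTE

# Passed as context_fn to torch.utils.checkpoint.checkpoint(..., use_reentrant=False).
context_fn = lambda: create_selective_checkpoint_contexts(_policy)
```

Adding the varlen op to the list keeps its output from being recomputed in backward, mirroring how flex attention is already treated.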
torchtitan/models/qwen3/model/model.py
Lines changed: 46 additions & 3 deletions

```diff
@@ -15,10 +15,13 @@
 from torchtitan.components.tokenizer import BaseTokenizer
 from torchtitan.models.attention import (
     create_attention_mask,
+    create_varlen_metadata_for_document,
     FlexAttentionWrapper,
     get_causal_mask_mod,
     get_document_mask_mod,
     ScaledDotProductAttentionWrapper,
+    VarlenAttentionWrapper,
+    VarlenMetadata,
 )
 from torchtitan.models.moe import MoE
 from torchtitan.protocols.model import AttentionMasksType
@@ -170,8 +173,12 @@ def __init__(self, model_args: Qwen3ModelArgs):
         match self.attn_type:
             case "flex":
                 self.inner_attention = FlexAttentionWrapper()
-            case _:
+            case "varlen":
+                self.inner_attention = VarlenAttentionWrapper()
+            case "sdpa":
                 self.inner_attention = ScaledDotProductAttentionWrapper()
+            case _:
+                raise ValueError(f"Unknown attention type: {self.attn_type}")
 
     def init_weights(self, init_std: float):
         for linear in (self.wq, self.wk, self.wv):
@@ -231,9 +238,20 @@ def forward(
             case "flex":
                 assert isinstance(attention_masks, BlockMask), attention_masks
                 output = self.inner_attention(xq, xk, xv, block_mask=attention_masks)
-            case _:
+            case "varlen":
+                assert isinstance(attention_masks, VarlenMetadata), attention_masks
+                output = self.inner_attention(
+                    xq,
+                    xk,
+                    xv,
+                    self.head_dim,
+                    attention_masks,
+                )
+            case "sdpa":
                 assert attention_masks is None
                 output = self.inner_attention(xq, xk, xv)
+            case _:
+                raise ValueError(f"Unknown attention type: {self.attn_type}")
 
         output = output.transpose(
             1, 2
```
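To make the new `varlen` branch concrete: with document packing and a block-causal mask, variable-length attention over a packed row is equivalent to running causal attention independently on each document segment. The sketch below shows that reference semantics with plain SDPA; it illustrates what a fused varlen kernel computes and is not torchtitan's `VarlenAttentionWrapper` implementation. The `(total_tokens, n_heads, head_dim)` layout and the `cu_seqlens` boundary convention are assumptions borrowed from common varlen kernels.

```python
# Reference-semantics sketch only: per-document causal SDPA over a packed row.
# Layout assumption: q, k, v are (total_tokens, n_heads, head_dim) and
# cu_seqlens holds cumulative document boundaries, e.g. [0, 4, 8].
import torch
import torch.nn.functional as F

def varlen_block_causal_reference(q, k, v, cu_seqlens):
    out = torch.empty_like(q)
    starts, ends = cu_seqlens[:-1].tolist(), cu_seqlens[1:].tolist()
    for start, end in zip(starts, ends):
        # (seg_len, n_heads, head_dim) -> (n_heads, seg_len, head_dim) for SDPA.
        qs, ks, vs = (t[start:end].transpose(0, 1) for t in (q, k, v))
        seg = F.scaled_dot_product_attention(qs, ks, vs, is_causal=True)
        out[start:end] = seg.transpose(0, 1)
    return out
```

The fused kernel avoids this Python loop and the padding a dense mask would need, which is where the on-par performance with no loss regression comes from.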
```diff
@@ -447,7 +465,7 @@ def _precompute_rope_cache(self) -> torch.Tensor:
             self.model_args.rope_theta,
         )
 
-    def get_attention_masks(
+    def _get_flex_attention_masks(
         self,
         input_batch: torch.Tensor,
         tokenizer: BaseTokenizer,
@@ -468,6 +486,31 @@ def get_attention_masks(
             and_masks(*mask_mods), B, None, input_batch.shape[1], input_batch.shape[1]
         )
 
+    def get_attention_masks(
+        self,
+        input_batch: torch.Tensor,
+        tokenizer: BaseTokenizer,
+        extra_inputs: dict[str, torch.Tensor] | None = None,
+    ) -> AttentionMasksType:
+        match self.model_args.attn_type:
+            case "flex":
+                return self._get_flex_attention_masks(
+                    input_batch, tokenizer, extra_inputs
+                )
+            case "varlen":
+                if self.model_args.attn_mask_type != "block_causal":
+                    raise ValueError(
+                        f"varlen attention is only supported with block_causal \
+                        attention mask type, got {self.model_args.attn_mask_type}"
+                    )
+                return create_varlen_metadata_for_document(
+                    input_batch, tokenizer.eos_id
+                )
+            case _:
+                raise NotImplementedError(
+                    "Only varlen and flex attn masks are supported"
+                )
+
     def forward(
         self,
         tokens: torch.Tensor,
```
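For intuition on what the `varlen` branch of `get_attention_masks` has to produce: `create_varlen_metadata_for_document` is not shown in this diff, but with `block_causal` packing the essential metadata is the set of cumulative sequence lengths marking document boundaries, which can be recovered from EOS positions. A conceptual sketch, not torchtitan's implementation; the helper name `sketch_cu_seqlens` and the single-packed-row simplification are illustrative only.

```python
# Conceptual sketch: derive cu_seqlens-style boundaries from EOS token positions
# in a packed batch. Assumes a single packed row for simplicity; this is not
# what create_varlen_metadata_for_document actually returns.
import torch

def sketch_cu_seqlens(input_batch: torch.Tensor, eos_id: int) -> torch.Tensor:
    tokens = input_batch[0]  # (seq_len,) token ids of one packed row
    eos_positions = (tokens == eos_id).nonzero(as_tuple=True)[0]
    # Boundaries: start of the row, one past each EOS, and end of the row.
    boundaries = torch.cat(
        [
            torch.zeros(1, dtype=torch.long, device=tokens.device),
            eos_positions + 1,
            torch.tensor([tokens.numel()], device=tokens.device),
        ]
    )
    # Drop the duplicate boundary when the row already ends with an EOS token.
    return torch.unique_consecutive(boundaries).to(torch.int32)

# Two documents of length 4 packed into one row of length 8 (eos_id == 2):
packed = torch.tensor([[5, 6, 7, 2, 9, 9, 9, 2]])
print(sketch_cu_seqlens(packed, eos_id=2))  # tensor([0, 4, 8], dtype=torch.int32)
```

This is also why the diff rejects any `attn_mask_type` other than `block_causal`: without per-document boundaries there is nothing for the varlen path to segment on.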
