Commit a47df59

add varlen attention for qwen 3

1 parent ad9f188
File tree: 1 file changed, +40 −1 lines

torchtitan/models/qwen3/model/model.py (40 additions, 1 deletion)
@@ -15,10 +15,13 @@
 from torchtitan.components.tokenizer import BaseTokenizer
 from torchtitan.models.attention import (
     create_attention_mask,
+    create_varlen_metadata_for_document,
     FlexAttentionWrapper,
     get_causal_mask_mod,
     get_document_mask_mod,
     ScaledDotProductAttentionWrapper,
+    VarlenAttentionWrapper,
+    VarlenMetadata,
 )
 from torchtitan.models.moe import MoE
 from torchtitan.protocols.model import AttentionMasksType
@@ -170,6 +173,8 @@ def __init__(self, model_args: Qwen3ModelArgs):
         match self.attn_type:
             case "flex":
                 self.inner_attention = FlexAttentionWrapper()
+            case "varlen":
+                self.inner_attention = VarlenAttentionWrapper()
             case _:
                 self.inner_attention = ScaledDotProductAttentionWrapper()

@@ -231,6 +236,15 @@ def forward(
             case "flex":
                 assert isinstance(attention_masks, BlockMask), attention_masks
                 output = self.inner_attention(xq, xk, xv, block_mask=attention_masks)
+            case "varlen":
+                assert isinstance(attention_masks, VarlenMetadata), attention_masks
+                output = self.inner_attention(
+                    xq,
+                    xk,
+                    xv,
+                    self.head_dim,
+                    attention_masks,
+                )
             case _:
                 assert attention_masks is None
                 output = self.inner_attention(xq, xk, xv)
@@ -447,7 +461,7 @@ def _precompute_rope_cache(self) -> torch.Tensor:
             self.model_args.rope_theta,
         )

-    def get_attention_masks(
+    def _get_flex_attention_masks(
         self,
         input_batch: torch.Tensor,
         tokenizer: BaseTokenizer,
@@ -468,6 +482,31 @@ def get_attention_masks(
             and_masks(*mask_mods), B, None, input_batch.shape[1], input_batch.shape[1]
         )

+    def get_attention_masks(
+        self,
+        input_batch: torch.Tensor,
+        tokenizer: BaseTokenizer,
+        extra_inputs: dict[str, torch.Tensor] | None = None,
+    ) -> AttentionMasksType:
+        match self.model_args.attn_type:
+            case "flex":
+                return self._get_flex_attention_masks(
+                    input_batch, tokenizer, extra_inputs
+                )
+            case "varlen":
+                if self.model_args.attn_mask_type != "block_causal":
+                    raise ValueError(
+                        f"varlen attention is only supported with block_causal \
+                        attention mask type, got {self.model_args.attn_mask_type}"
+                    )
+                return create_varlen_metadata_for_document(
+                    input_batch, tokenizer.eos_id
+                )
+            case _:
+                raise NotImplementedError(
+                    "Only varlen and flex attn masks are supported"
+                )
+
     def forward(
         self,
         tokens: torch.Tensor,

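For context on what the new "varlen" path consumes: create_varlen_metadata_for_document is not shown in this diff, but variable-length attention kernels (for example FlashAttention's varlen interface) are typically driven by cumulative sequence lengths derived from document boundaries, so attention never crosses an EOS-delimited document. The sketch below is only a rough mental model under that assumption; build_cu_seqlens and its return shape are illustrative and are not torchtitan's VarlenMetadata API.

# Illustrative sketch only (NOT torchtitan's implementation): build FlashAttention-style
# cumulative sequence lengths (cu_seqlens) from an EOS-delimited token batch, assuming
# documents end at eos_id and attention must not cross document or sample boundaries.
import torch

def build_cu_seqlens(tokens: torch.Tensor, eos_id: int) -> tuple[torch.Tensor, int]:
    """tokens: (batch, seq_len) integer tensor. Returns (cu_seqlens, max_seqlen)."""
    batch, seq_len = tokens.shape
    flat = tokens.reshape(-1)
    # Document ends = one past every EOS position in the flattened batch.
    doc_ends = (flat == eos_id).nonzero(as_tuple=True)[0] + 1
    # Every sample boundary also ends a document, even without a trailing EOS.
    sample_ends = torch.arange(1, batch + 1, device=tokens.device) * seq_len
    ends = torch.unique(torch.cat([doc_ends, sample_ends]))
    cu_seqlens = torch.cat(
        [torch.zeros(1, dtype=torch.int32, device=tokens.device), ends.to(torch.int32)]
    )
    max_seqlen = int((cu_seqlens[1:] - cu_seqlens[:-1]).max())
    return cu_seqlens, max_seqlen

Presumably VarlenMetadata packages something equivalent (boundary offsets plus a maximum segment length); the diff itself only shows that the wrapper additionally receives self.head_dim alongside that metadata, and that the path is restricted to the "block_causal" mask type.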