Commit 8e9e3c2

Authored and committed by Guang Yang

Use custom sdpa for ExecuTorch
1 parent 2f917c3 commit 8e9e3c2

File tree

8 files changed: +168 -3 lines changed

optimum/executorch/attentions/custom_sdpa.py

Lines changed: 66 additions & 0 deletions

@@ -0,0 +1,66 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional, Tuple, Union
+
+import torch
+from executorch.extension.llm.custom_ops.custom_ops import custom_sdpa  # noqa
+
+
+def custom_sdpa_with_start_pos_forward(
+    module: torch.nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Union[torch.Tensor, "BlockMask"],
+    scaling: Optional[float] = None,
+    softcap: Optional[float] = None,
+    head_mask: Optional[torch.Tensor] = None,
+    **kwargs,
+) -> Tuple[torch.Tensor, None]:
+    # This is before the transpose
+    max_seq_len = key.shape[2]
+
+    # FA2 uses non-transposed inputs
+    query = query.transpose(1, 2)
+    key = key.transpose(1, 2)
+    value = value.transpose(1, 2)
+
+    # Upcast the inputs to fp32 for the custom op and cast the output back afterwards
+    input_dtype = query.dtype
+    query = query.to(torch.float32)
+    key = key.to(torch.float32)
+    value = value.to(torch.float32)
+
+    # Ignore the causal flag from kwargs but use the one in module
+    kwargs.pop("is_causal", None)
+
+    # Calculate the input pos from attention mask.
+    # Branch out for float vs bool mask
+    # assert attention_mask.dim() == 2, f"attention_mask must be a 2D matrix."
+    attention_mask = attention_mask.reshape(-1, max_seq_len)
+    first_row_mask = attention_mask[0, :]
+    # [0, 0, 0, 0, -inf, -inf, -inf, -inf], start_pos = 3
+    start_pos = torch.argmin(first_row_mask).item() - 1
+    output = torch.ops.llama.custom_sdpa(
+        query,
+        key,
+        value,
+        start_pos=start_pos,
+        attn_mask=None,
+        drpout_p=0.0,
+        is_causal=module.is_causal,
+        scale=scaling,
+    )
+    return output.to(input_dtype), None
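
For reference, the way custom_sdpa_with_start_pos_forward recovers the start position from the mask can be reproduced in isolation. The snippet below is an editor's illustrative sketch, not part of the commit: it builds a toy float mask row in which visible positions hold 0.0 and masked positions hold -inf, so torch.argmin lands on the first masked position and argmin - 1 is the current position.

import torch

max_seq_len = 8
first_row_mask = torch.full((max_seq_len,), float("-inf"))
first_row_mask[:4] = 0.0  # positions 0..3 are visible, the rest are masked out
# argmin returns the index of the first -inf (4), so start_pos = 3,
# matching the "[0, 0, 0, 0, -inf, ...], start_pos = 3" comment above.
start_pos = torch.argmin(first_row_mask).item() - 1
print(start_pos)  # 3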

optimum/executorch/modeling.py

Lines changed: 2 additions & 0 deletions

@@ -594,6 +594,8 @@ def __init__(self, models: Dict[str, "ExecuTorchModule"], config: "PretrainedConfig"):
             self.eos_token_id = self.model.run_method("get_eos_id")[0]
         if "get_vocab_size" in metadata:
             self.vocab_size = self.model.run_method("get_vocab_size")[0]
+        if "use_sdpa_with_kv_cache" in metadata:
+            self.use_sdpa_with_kv_cache = self.model.run_method("use_sdpa_with_kv_cache")[0]
 
     def forward(
         self,

optimum/exporters/executorch/convert.py

Lines changed: 7 additions & 0 deletions

@@ -19,11 +19,18 @@
 from pathlib import Path
 from typing import Union
 
+from transformers.modeling_utils import AttentionInterface
+
+from optimum.executorch.attentions.custom_sdpa import custom_sdpa_with_start_pos_forward
+
 from .recipe_registry import discover_recipes, recipe_registry
 
 
 logger = logging.getLogger(__name__)
 
+# Register custom sdpa via `AttentionInterface` unconditionally
+AttentionInterface.register("executorch_custom_sdpa", custom_sdpa_with_start_pos_forward)
+
 
 def export_to_executorch(
     model,
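
The AttentionInterface registration above is what makes attn_implementation="executorch_custom_sdpa" resolvable when a model is loaded through transformers. As a rough, hedged sketch of the same mechanism (the function name my_sdpa, its delegation to the stock SDPA helper, and the SmolLM2 checkpoint are illustrative choices, and downloading the checkpoint requires network access):

from transformers import AutoModelForCausalLM
from transformers.modeling_utils import AttentionInterface
from transformers.integrations.sdpa_attention import sdpa_attention_forward


def my_sdpa(module, query, key, value, attention_mask, **kwargs):
    # Stand-in implementation: simply delegate to the built-in SDPA path.
    return sdpa_attention_forward(module, query, key, value, attention_mask, **kwargs)


# Once registered, the name can be requested at load time, exactly like
# "executorch_custom_sdpa" is in the tests further down.
AttentionInterface.register("my_sdpa", my_sdpa)
model = AutoModelForCausalLM.from_pretrained(
    "HuggingFaceTB/SmolLM2-135M", attn_implementation="my_sdpa"
)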

optimum/exporters/executorch/recipes/xnnpack.py

Lines changed: 10 additions & 0 deletions

@@ -77,4 +77,14 @@ def _lower_to_executorch(
         return et_progs
 
     exported_progs = model.export()
+
+    if model.config._attn_implementation == "executorch_custom_sdpa":
+        # Sanity check to make sure the exported program contains the custom sdpa operator.
+        if not any(
+            node.op == "call_function" and "custom_sdpa" in str(node.target)
+            for exported_program in exported_progs.values()
+            for node in exported_program.graph_module.graph.nodes
+        ):
+            raise ValueError("'custom_sdpa' not found in the graph.")
+
     return _lower_to_executorch(exported_progs, model.metadata)
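
The sanity check above walks every node of each exported graph and looks for a call to the custom sdpa operator. The same scanning pattern can be tried on plain torch.export output with no ExecuTorch dependency; the toy module below is purely illustrative.

import torch
from torch.export import export


class Toy(torch.nn.Module):
    def forward(self, x):
        return torch.relu(x) + 1.0


ep = export(Toy(), (torch.randn(2, 2),))
# Same shape of check as in the recipe: scan call_function nodes by target name.
found = any(
    node.op == "call_function" and "relu" in str(node.target)
    for node in ep.graph_module.graph.nodes
)
print(found)  # expected: True, since aten.relu survives export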

optimum/exporters/executorch/utils.py

Lines changed: 1 addition & 0 deletions

@@ -43,6 +43,7 @@ def save_config_to_constant_methods(
         "get_max_batch_size": 1,
         "get_max_seq_len": getattr(config, "max_position_embeddings", None),
         "decoder_start_token_id": getattr(config, "decoder_start_token_id", None),
+        "use_sdpa_with_kv_cache": "custom_sdpa" in config._attn_implementation,
     }
 
     # Safely access fields from generation_config if it exists

setup.py

Lines changed: 1 addition & 1 deletion

@@ -14,7 +14,7 @@
 INSTALL_REQUIRE = [
     "optimum~=1.24",
     "executorch>=0.4.0,!=0.5.0",  # https://github.com/huggingface/optimum-executorch/issues/14
-    "transformers>=4.46,<=4.50.1",
+    "transformers==4.51.0",
 ]
 
 TESTS_REQUIRE = [

tests/models/test_modeling_qwen2.py

Lines changed: 40 additions & 1 deletion

@@ -21,14 +21,17 @@
 
 import pytest
 from executorch.extension.pybindings.portable_lib import ExecuTorchModule
-from transformers import AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer
 from transformers.testing_utils import slow
 
 from optimum.executorch import ExecuTorchModelForCausalLM
 
 from ..utils import check_causal_lm_output_quality
 
 
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+
 class ExecuTorchModelIntegrationTest(unittest.TestCase):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -63,3 +66,39 @@ def test_qwen2_5_text_generation(self):
         )
         logging.info(f"\nGenerated text:\n\t{generated_text}")
         self.assertTrue(check_causal_lm_output_quality(model_id, generated_text))
+
+    @slow
+    @pytest.mark.run_slow
+    def test_qwen2_5_text_generation_with_custom_sdpa(self):
+        model_id = "Qwen/Qwen2.5-0.5B"
+        prompt = "My favourite condiment is "
+        max_seq_len = 32
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+        # ExecuTorch model + custom sdpa
+        model = ExecuTorchModelForCausalLM.from_pretrained(
+            model_id,
+            recipe="xnnpack",
+            attn_implementation="executorch_custom_sdpa",
+        )
+        self.assertIsInstance(model, ExecuTorchModelForCausalLM)
+        self.assertIsInstance(model.model, ExecuTorchModule)
+        generated_text = model.text_generation(
+            tokenizer=tokenizer,
+            prompt=prompt,
+            max_seq_len=max_seq_len,
+        )
+        logging.info(f"\nGenerated text:\n\t{generated_text}")
+        self.assertTrue(check_causal_lm_output_quality(model_id, generated_text))
+
+        # Eager model + custom sdpa
+        eager_model = AutoModelForCausalLM.from_pretrained(
+            model_id,
+            attn_implementation="executorch_custom_sdpa",
+        )
+        self.assertEqual(eager_model.config._attn_implementation, "executorch_custom_sdpa")
+        eager_inputs = tokenizer(prompt, return_tensors="pt").to(eager_model.device)
+        eager_generated_ids = eager_model.generate(**eager_inputs, max_new_tokens=max_seq_len)
+        eager_generated_text = tokenizer.batch_decode(eager_generated_ids, skip_special_tokens=True)[0]
+        logging.info(f"\nEager generated text:\n\t{eager_generated_text}")
+        self.assertTrue(check_causal_lm_output_quality(model_id, eager_generated_text))

tests/models/test_modeling_smollm.py

Lines changed: 41 additions & 1 deletion

@@ -21,14 +21,17 @@
 
 import pytest
 from executorch.extension.pybindings.portable_lib import ExecuTorchModule
-from transformers import AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer
 from transformers.testing_utils import slow
 
 from optimum.executorch import ExecuTorchModelForCausalLM
 
 from ..utils import check_causal_lm_output_quality
 
 
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+
 class ExecuTorchModelIntegrationTest(unittest.TestCase):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -63,3 +66,40 @@ def test_smollm_text_generation(self):
         )
         logging.info(f"\nGenerated text:\n\t{generated_text}")
         self.assertTrue(check_causal_lm_output_quality(model_id, generated_text))
+
+    @slow
+    @pytest.mark.run_slow
+    def test_smollm_text_generation_with_custom_sdpa(self):
+        model_id = "HuggingFaceTB/SmolLM2-135M"
+        prompt = "My favourite condiment is "
+        max_seq_len = 32
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+        # ExecuTorch model + custom sdpa
+        model = ExecuTorchModelForCausalLM.from_pretrained(
+            model_id,
+            recipe="xnnpack",
+            attn_implementation="executorch_custom_sdpa",
+        )
+        self.assertIsInstance(model, ExecuTorchModelForCausalLM)
+        self.assertIsInstance(model.model, ExecuTorchModule)
+
+        generated_text = model.text_generation(
+            tokenizer=tokenizer,
+            prompt=prompt,
+            max_seq_len=max_seq_len,
+        )
+        logging.info(f"\nGenerated text:\n\t{generated_text}")
+        self.assertTrue(check_causal_lm_output_quality(model_id, generated_text))
+
+        # Eager model + custom sdpa
+        eager_model = AutoModelForCausalLM.from_pretrained(
+            model_id,
+            attn_implementation="executorch_custom_sdpa",
+        )
+        self.assertEqual(eager_model.config._attn_implementation, "executorch_custom_sdpa")
+        eager_inputs = tokenizer(prompt, return_tensors="pt").to(eager_model.device)
+        eager_generated_ids = eager_model.generate(**eager_inputs, max_new_tokens=max_seq_len)
+        eager_generated_text = tokenizer.batch_decode(eager_generated_ids, skip_special_tokens=True)[0]
+        logging.info(f"\nEager generated text:\n\t{eager_generated_text}")
+        self.assertTrue(check_causal_lm_output_quality(model_id, eager_generated_text))
