Commit 0385230

Update on "introduce triton sdpa kernel to cuda backend"
**Introduce Triton SDPA Kernel to CUDA Backend**

This diff introduces a Triton-optimized scaled dot-product attention (SDPA) kernel to the CUDA backend. The new kernel replaces the default Edge SDPA operator during graph transformation, accelerating model inference and removing the need to decompose SDPA.

**Changes**

* Added a new file `sdpa.py` to the `fbcode/executorch/backends/cuda/triton/kernels` directory, containing the Triton-optimized SDPA kernel implementation.
* Added a new file `__init__.py` to `fbcode/executorch/backends/cuda/triton/replacement_pass`, which replaces the given existing Edge ops with the target Triton kernels.
* Added tests for SDPA export with the Triton kernel. Without the Triton kernel, an SDPA model cannot be exported.

**Purpose**

The purpose of this diff is to provide a high-performance SDPA kernel for the CUDA backend that can be used to accelerate attention-based models on NVIDIA GPUs.

Differential Revision: [D87259044](https://our.internmc.facebook.com/intern/diff/D87259044/)

[ghstack-poisoned]
2 parents 1023d93 + 13d1f3a commit 0385230
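
For context, here is a minimal sketch of the export flow this kernel unblocks. The module name `ToyAttention` is hypothetical and CUDA availability is assumed; this is not code from the commit:

```python
import torch
import torch.nn.functional as F


class ToyAttention(torch.nn.Module):
    """Hypothetical toy module; lowering its forward to the Edge dialect
    yields the SDPA operator that the new pass swaps for the Triton kernel."""

    def forward(self, q, k, v):
        return F.scaled_dot_product_attention(q, k, v)


q = k = v = torch.randn(1, 8, 128, 64, device="cuda")
# Without the Triton replacement, the Edge SDPA op would otherwise have
# to be decomposed during export.
ep = torch.export.export(ToyAttention(), (q, k, v))
```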

File tree

12 files changed: +376 −104

backends/cadence/aot/replace_ops.py

Lines changed: 29 additions & 33 deletions
```diff
@@ -69,50 +69,46 @@ def contains_placeholder_or_param(nodes: Iterable[torch.fx.Node]) -> bool:
 
 
 @register_cadence_pass(CadencePassAttribute(opt_level=0))
-class ReplaceLogicalNotBooleanWhereWithWherePass(ExportPass):
+class ReplaceLogicalNotBooleanWhereWithWherePass(RemoveOrReplacePassInterface):
     """
     A where op with a logical_not and a boolean tensor can be replaced
     by a where op with flipped inputs and the initial boolean tensor.
     """
 
-    def replace_logical_nop_where_with_where(
-        self, graph_module: torch.fx.GraphModule
-    ) -> None:
-        graph = graph_module.graph
-        for node in graph.nodes:
-            # We are only interested in where nodes
-            if node.target != exir_ops.edge.aten.where.self:
-                continue
+    @property
+    def targets(self) -> list[EdgeOpOverload]:
+        return [exir_ops.edge.aten.where.self]
 
-            # If the third arg is not a logical_not, bail.
-            if node.args[0].target != exir_ops.edge.aten.logical_not.default:
-                continue
+    def maybe_remove_or_replace(self, node: torch.fx.Node) -> bool:
+        # If the first arg is not a logical_not, bail.
+        if not isinstance(node.args[0], torch.fx.Node):
+            return False
 
-            # Get the third arg node and its input
-            logical_not_node = node.args[0]
-            logical_not_input_node = logical_not_node.args[0]
+        logical_not_node = cast(torch.fx.Node, node.args[0])
+        if logical_not_node.target != exir_ops.edge.aten.logical_not.default:
+            return False
 
-            # If the logical_not input is not a boolean tensor, bail.
-            if logical_not_input_node.meta["val"].dtype != torch.bool:
-                continue
+        # Get the first arg node and its input
+        if not isinstance(logical_not_node.args[0], torch.fx.Node):
+            return False
 
-            # Replace the where op with another one, flipping the inputs and using the boolean
-            # tensor from logical_not.
-            with graph.inserting_before(node):
-                linear_node = graph.call_function(
-                    exir_ops.edge.aten.where.self,
-                    args=(logical_not_node.args[0], node.args[2], node.args[1]),
-                )
-            # Replace all the uses
-            node.replace_all_uses_with(linear_node)
+        logical_not_input_node = cast(torch.fx.Node, logical_not_node.args[0])
 
-            graph_module.recompile()
-            graph_module.graph.eliminate_dead_code()
+        # If the logical_not input is not a boolean tensor, bail.
+        if logical_not_input_node.meta["val"].dtype != torch.bool:
+            return False
 
-    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
-        self.replace_logical_nop_where_with_where(graph_module)
-        result = super().call(graph_module)
-        return result
+        # Replace the where op with another one, flipping the inputs and using the boolean
+        # tensor from logical_not.
+        with node.graph.inserting_before(node):
+            new_node = node.graph.call_function(
+                exir_ops.edge.aten.where.self,
+                args=(logical_not_input_node, node.args[2], node.args[1]),
+            )
+        new_node.meta = node.meta
+        # Replace all the uses
+        node.replace_all_uses_with(new_node)
+        return True
 
 
 @register_cadence_pass(CadencePassAttribute(opt_level=0))
```
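
The rewrite is sound because of a simple identity: selecting through a negated boolean mask equals selecting through the original mask with the branches swapped. A quick sanity check (illustrative only, using plain `torch` ops rather than Edge dialect nodes):

```python
import torch

cond = torch.rand(4, 8) > 0.5  # boolean tensor, as the pass requires
x, y = torch.randn(4, 8), torch.randn(4, 8)

# where(logical_not(cond), x, y) == where(cond, y, x)
assert torch.equal(
    torch.where(torch.logical_not(cond), x, y),
    torch.where(cond, y, x),
)
```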

backends/cadence/aot/tests/test_replace_ops_passes.py

Lines changed: 112 additions & 0 deletions
```diff
@@ -33,6 +33,7 @@
     ReplaceFunctionallyEquivalentOpTargets,
     ReplaceIm2RowWithViewPass,
     ReplaceLinearWithFullyConnectedOpPass,
+    ReplaceLogicalNotBooleanWhereWithWherePass,
     ReplaceMatmulWithTransposedMatmulPass,
     ReplaceMMWithAddMMPass,
     ReplaceMulTensorWithMulAndFullOpsPass,
@@ -2053,3 +2054,114 @@ def test_replace_quantized_embedding(
             ),
             1,
         )
+
+
+class TestReplaceLogicalNotBooleanWhereWithWherePass(unittest.TestCase):
+    """Tests for the ReplaceLogicalNotBooleanWhereWithWherePass."""
+
+    def test_replace_where_with_logical_not_boolean(self) -> None:
+        """Test that where(logical_not(bool_cond), x, y) is replaced with where(bool_cond, y, x)."""
+        # Setup: Create a graph with where(logical_not(bool_cond), x, y)
+        builder = GraphBuilder()
+        bool_cond_ = torch.randn(4, 8) > 0
+        x_ = torch.randn(4, 8)
+        y_ = torch.randn(4, 8)
+
+        bool_cond = builder.placeholder("bool_cond", bool_cond_)
+        x = builder.placeholder("x", x_)
+        y = builder.placeholder("y", y_)
+
+        # Create logical_not node
+        logical_not = builder.call_operator(
+            op=exir_ops.edge.aten.logical_not.default,
+            args=(bool_cond,),
+        )
+
+        # Create where node using logical_not
+        where_node = builder.call_operator(
+            op=exir_ops.edge.aten.where.self,
+            args=(logical_not, x, y),
+        )
+        builder.output([where_node])
+        original_gm = builder.get_graph_module()
+
+        # Make a copy of the original graph before applying the pass
+        original_gm_copy = copy.deepcopy(original_gm)
+
+        # Execute: Apply the replacement pass
+        p = ReplaceLogicalNotBooleanWhereWithWherePass()
+        result = cast(PassResult, p(original_gm))
+
+        # Assert: Verify the pass modified the graph
+        self.assertTrue(result.modified)
+        graph_after_passes = result.graph_module
+
+        # Assert: Verify logical_not is removed (dead code elimination)
+        self.assertEqual(
+            count_node(graph_after_passes, exir_ops.edge.aten.logical_not.default),
+            0,
+        )
+
+        # Assert: Verify where node still exists
+        self.assertEqual(
+            count_node(graph_after_passes, exir_ops.edge.aten.where.self),
+            1,
+        )
+
+        # Assert: Verify the arguments are flipped (condition uses original bool_cond, x and y are swapped)
+        where_nodes = list(
+            graph_after_passes.graph.find_nodes(
+                op="call_function", target=exir_ops.edge.aten.where.self
+            )
+        )
+        for node in where_nodes:
+            # First arg should be the original bool_cond (not the logical_not)
+            self.assertEqual(node.args[0].name, "bool_cond")
+            # Second and third args should be swapped (y, x instead of x, y)
+            self.assertEqual(node.args[1].name, "y")
+            self.assertEqual(node.args[2].name, "x")
+
+        # Assert: Verify outputs match exactly by running both graphs
+        validate(
+            original_gm_copy,
+            graph_after_passes,
+            (bool_cond_, x_, y_),
+            "ReplaceLogicalNotBooleanWhereWithWherePass",
+        )
+
+    def test_no_replacement_without_logical_not(self) -> None:
+        """Test that the pass does NOT apply when there's no logical_not."""
+        # Setup: Create a graph with where(bool_cond, x, y) without logical_not
+        builder = GraphBuilder()
+        bool_cond = builder.placeholder("bool_cond", torch.randn(4, 8) > 0)
+        x = builder.placeholder("x", torch.randn(4, 8))
+        y = builder.placeholder("y", torch.randn(4, 8))
+
+        # Create where node directly without logical_not
+        where_node = builder.call_operator(
+            op=exir_ops.edge.aten.where.self,
+            args=(bool_cond, x, y),
+        )
+        builder.output([where_node])
+        original_gm = builder.get_graph_module()
+
+        # Execute: Apply the replacement pass
+        p = ReplaceLogicalNotBooleanWhereWithWherePass()
+        result = cast(PassResult, p(original_gm))
+
+        # Assert: Verify the pass did NOT modify the graph
+        self.assertFalse(result.modified)
+        graph_after_passes = result.graph_module
+
+        # Assert: Verify where node still exists unchanged
+        self.assertEqual(
+            count_node(graph_after_passes, exir_ops.edge.aten.where.self),
+            1,
+        )
+
+        for node in graph_after_passes.graph.find_nodes(
+            op="call_function", target=exir_ops.edge.aten.where.self
+        ):
+            self.assertEqual(node.args[0].name, "bool_cond")
+            self.assertEqual(node.args[1].name, "x")
+            self.assertEqual(node.args[2].name, "y")
```

backends/cadence/runtime/runtime.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -45,7 +45,7 @@ def get_op_names(program: et_schema.Program, execution_plan_id: int = 0) -> set[
         op_names |= get_op_names(
             deserialize_pte_binary(
                 program.backend_delegate_data[delegate.processed.index].data
-            )
+            ).program
         )
     return op_names
 
```

backends/cuda/triton/kernels/sdpa.py

Lines changed: 4 additions & 2 deletions
```diff
@@ -72,6 +72,8 @@ def _validate_qkv_shapes(
         triton.Config({"BLOCK_M": 64, "BLOCK_N": 128}, num_stages=4, num_warps=4),
         triton.Config({"BLOCK_M": 128, "BLOCK_N": 64}, num_stages=3, num_warps=4),
         triton.Config({"BLOCK_M": 64, "BLOCK_N": 64}, num_stages=3, num_warps=4),
+        triton.Config({"BLOCK_M": 64, "BLOCK_N": 32}, num_stages=1, num_warps=2),
+        triton.Config({"BLOCK_M": 32, "BLOCK_N": 64}, num_stages=1, num_warps=2),
     ],
     key=["L_Q", "L_KV", "HEAD_DIM"],
 )
@@ -348,8 +350,8 @@ def _sdpa_abstract(
     attn_mask: Optional[torch.Tensor] = None,
     dropout_p: float = 0.0,
     is_causal: bool = False,
-    scale=None,
-    enable_gqa=False,
+    scale: float = 0.0,
+    enable_gq: bool = False,
 ) -> torch.Tensor:
     """
     Abstract/fake implementation for torch.export.
```
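
For readers unfamiliar with the autotuning pattern touched above: `triton.autotune` benchmarks each `triton.Config` the first time the kernel is launched with a new combination of the `key` arguments and caches the winner. A minimal sketch of the decorator structure, with a toy kernel body rather than the SDPA kernel from this file:

```python
import triton
import triton.language as tl


@triton.autotune(
    configs=[
        # Larger tiles for long sequences; smaller low-stage/low-warp
        # tiles (the kind added in this diff) for short ones.
        triton.Config({"BLOCK_M": 64, "BLOCK_N": 64}, num_stages=3, num_warps=4),
        triton.Config({"BLOCK_M": 32, "BLOCK_N": 64}, num_stages=1, num_warps=2),
    ],
    key=["L_Q", "L_KV", "HEAD_DIM"],  # retune when any of these change
)
@triton.jit
def _toy_kernel(out_ptr, L_Q, L_KV, HEAD_DIM,
                BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr):
    # Placeholder body; a real kernel tiles work over BLOCK_M x BLOCK_N.
    pass
```

Adding the two small configs gives the autotuner viable candidates for short sequence lengths, where 128-wide tiles would waste work.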

backends/cuda/triton/replacement_pass.py

Lines changed: 4 additions & 7 deletions
```diff
@@ -31,10 +31,9 @@ class ReplaceEdgeOpWithTritonOpPass(PassBase):
     """
     Pass to replace ATen operators with Triton kernels.
 
-    This pass scans the graph for ATen operators that have registered Triton
-    replacements and replaces them with the optimized Triton implementations.
-
-    It automatically imports EDGE_TO_TRITON_KERNELS from cuda_backend.py.
+    This pass scans the graph for Edge operators that have registered Triton
+    replacements using EDGE_TO_TRITON_KERNELS and replaces them with the
+    optimized Triton implementations.
     """
 
     def __init__(self):
@@ -73,7 +72,7 @@ def call(self, graph_module: GraphModule) -> PassResult:
         # Recompile the graph module after modifications
         graph_module.recompile()
 
-        print(f"Replaced {self._replacement_count} nodes with Triton kernels")
+        logger.info(f"Replaced {self._replacement_count} nodes with Triton kernels")
 
         return PassResult(graph_module, modified)
 
@@ -83,7 +82,6 @@ def _should_replace_node(self, node: Node) -> bool:
 
         Args:
             node: The node to check
-            EDGE_TO_TRITON_KERNELS: Mapping from edge ops to Triton kernels
 
         Returns:
             True if the node should be replaced
@@ -101,7 +99,6 @@ def _replace_node_with_triton(self, graph_module: GraphModule, node: Node) -> No
         Args:
             graph_module: The graph module containing the node
             node: The node to replace
-            EDGE_TO_TRITON_KERNELS: Mapping from edge ops to Triton kernels
         """
         # Get the target operator (should be an exir_ops edge dialect op)
         target = node.target
```
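
Assuming the pass follows the usual `PassBase` calling convention, invoking it might look like the sketch below. The import path is inferred from the file location and `edge_gm` is a placeholder for an already-lowered Edge-dialect `GraphModule`; this is hypothetical driver code, not from this commit:

```python
import logging

# Import path inferred from backends/cuda/triton/replacement_pass.py.
from executorch.backends.cuda.triton.replacement_pass import (
    ReplaceEdgeOpWithTritonOpPass,
)

logging.basicConfig(level=logging.INFO)  # surfaces the logger.info summary

# `edge_gm` stands in for an already-lowered Edge-dialect GraphModule.
result = ReplaceEdgeOpWithTritonOpPass()(edge_gm)
if result.modified:
    edge_gm = result.graph_module  # SDPA nodes now call the Triton kernel
```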

backends/qualcomm/utils/utils.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -197,7 +197,7 @@ def dump_context_from_pte(pte_path) -> List[str]:
     with open(pte_path, "rb") as f:
         program_data = f.read()
 
-    program = deserialize_pte_binary(program_data)
+    program = deserialize_pte_binary(program_data).program
 
     ctx_path = os.path.dirname(pte_path)
     dummy_compiler_specs = generate_qnn_executorch_compiler_spec(
```

codegen/tools/gen_ops_def.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -23,7 +23,7 @@ def get_operators(model_file: str) -> List[Operator]:
     print("Processing model file: ", model_file)
     with open(model_file, "rb") as f:
         flatbuffer = f.read()
-    program = _deserialize_pte_binary(flatbuffer)
+    program = _deserialize_pte_binary(flatbuffer).program
     print(f"Program loaded from model file: {model_file}")
     operators = program.execution_plan[0].operators
     return operators
```

examples/qualcomm/oss_scripts/llama/decoder_utils.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -276,7 +276,7 @@ def __init__(  # noqa: C901
 
         with open(pte_path, "rb") as f:
             program_data = f.read()
-        program = deserialize_pte_binary(program_data)
+        program = deserialize_pte_binary(program_data).program
 
         # Retrieve vocab_size from get_metadata under static_llama that is passed to edge manager
         self.output_vocab_size = None
```

exir/_serialize/__init__.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -8,11 +8,13 @@
 
 from executorch.exir._serialize._program import (
     deserialize_pte_binary as _deserialize_pte_binary,
+    PTEFile as _PTEFile,
     serialize_pte_binary as _serialize_pte_binary,
 )
 
 # Internal APIs that should not be used outside of exir.
 __all__ = [
     "_deserialize_pte_binary",
     "_serialize_pte_binary",
+    "_PTEFile",
 ]
```
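
The `.program` call-site changes across the files above all follow from this export: `deserialize_pte_binary` now returns a `PTEFile` wrapper rather than the flatbuffer `Program` itself. A sketch of the updated pattern, with a placeholder file name:

```python
from executorch.exir._serialize import _deserialize_pte_binary

with open("model.pte", "rb") as f:  # placeholder path
    pte_bytes = f.read()

# The deserializer now returns a PTEFile; the flatbuffer Program
# lives on its `.program` attribute.
program = _deserialize_pte_binary(pte_bytes).program
print(program.execution_plan[0].operators)
```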
