Skip to content

Commit 1023d93

Browse files
committed
Update on "introduce triton sdpa kernel to cuda backend"
**Introduce Triton SDPA Kernel to CUDA Backend** This diff introduces a Triton-optimized implementation of the scaled dot-product attention (SDPA) kernel to the CUDA backend. The new kernel is designed to replace the default Edge SDPA operator during graph transformation, accelerating model inference and avoiding SDPA decomposition. **Changes** * Added a new file `sdpa.py` to the `fbcode/executorch/backends/cuda/triton/kernels` directory, which contains the Triton-optimized SDPA kernel implementation. * Added a new file `__init__.py` to `fbcode/executorch/backends/cuda/triton/replacement_pass`, which replaces the given existing edge ops with the target Triton kernels. * Added tests for SDPA exporting with the Triton kernel. Without the Triton kernel, the SDPA model cannot be exported. **Purpose** The purpose of this diff is to provide a high-performance SDPA kernel for the CUDA backend, which can be used to accelerate attention-based models on NVIDIA GPUs. Differential Revision: [D87259044](https://our.internmc.facebook.com/intern/diff/D87259044/) [ghstack-poisoned]
2 parents 1ce8bc6 + abbf37c commit 1023d93

File tree

10 files changed

+209
-114
lines changed

10 files changed

+209
-114
lines changed

CMakeLists.txt

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -801,6 +801,9 @@ if(EXECUTORCH_BUILD_PYBIND)
801801
torch
802802
)
803803

804+
# RPATH for _portable_lib.so
805+
set(_portable_lib_rpath "$ORIGIN/../../../torch/lib")
806+
804807
if(EXECUTORCH_BUILD_EXTENSION_MODULE)
805808
# Always use static linking for pybindings to avoid runtime symbol
806809
# resolution issues
@@ -835,6 +838,7 @@ if(EXECUTORCH_BUILD_PYBIND)
835838

836839
if(EXECUTORCH_BUILD_QNN)
837840
list(APPEND _dep_libs qnn_executorch_backend)
841+
string(APPEND _portable_lib_rpath ":$ORIGIN/../../backends/qualcomm")
838842
endif()
839843

840844
if(EXECUTORCH_BUILD_ENN)
@@ -886,19 +890,20 @@ if(EXECUTORCH_BUILD_PYBIND)
886890
target_compile_options(portable_lib PUBLIC ${_pybind_compile_options})
887891
target_link_libraries(portable_lib PRIVATE ${_dep_libs})
888892

889-
# Set RPATH to find PyTorch libraries relative to the installation location
890-
# This goes from executorch/extension/pybindings up to site-packages, then to
891-
# torch/lib. Don't do this to APPLE, as it will error out on the following
892-
# error:
893+
# Set RPATH to find PyTorch and backend libraries relative to the installation
894+
# location. This goes from executorch/extension/pybindings up to
895+
# site-packages, then to torch/lib. If QNN is enabled, also add
896+
# backends/qualcomm/. Don't do this to APPLE, as it will error out on the
897+
# following error:
893898
#
894899
if(APPLE)
895900
# Skip setting @loader_path for APPLE, since it causes error like ld:
896901
# duplicate LC_RPATH '@loader_path' in '<site-packages>/torch/lib/
897902
# libtorch_cpu.dylib'
898903
else()
899904
set_target_properties(
900-
portable_lib PROPERTIES BUILD_RPATH "$ORIGIN/../../../torch/lib"
901-
INSTALL_RPATH "$ORIGIN/../../../torch/lib"
905+
portable_lib PROPERTIES BUILD_RPATH "${_portable_lib_rpath}"
906+
INSTALL_RPATH "${_portable_lib_rpath}"
902907
)
903908
endif()
904909

backends/cuda/triton/kernels/sdpa.py

Lines changed: 12 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,12 @@
55
# LICENSE file in the root directory of this source tree.
66

77
"""
8-
Optimized Triton SDPA Kernel for ExecuTorch CUDA Backend.
8+
Triton SDPA Kernel for ExecuTorch CUDA Backend.
99
1010
This module provides a Triton-optimized implementation of scaled dot-product attention
11-
that can replace the default ATen SDPA operator during graph transformation.
11+
that can replace the default ATen/Edge SDPA operator during graph transformation to allow
12+
us to export the model without decomposing the SDPA operator in a libtorch-free environment
13+
and have better performance.
1214
"""
1315

1416
import math
@@ -221,22 +223,17 @@ def sdpa(
221223
"""
222224
Triton fused Scaled Dot-Product Attention with support for different sequence lengths.
223225
224-
Supports different sequence lengths for query and key/value:
225-
- Query: [B, H, L_q, D]
226-
- Key: [B, H, L_kv, D]
227-
- Value: [B, H, L_kv, D]
228-
- Output: [B, H, L_q, D] (matches query shape)
229226
Args:
230-
query: Query tensor [B, H, L_q, D]
231-
key: Key tensor [B, H, L_kv, D]
232-
value: Value tensor [B, H, L_kv, D]
233-
attn_mask: Optional attention mask [B, H, L_q, L_kv] or broadcastable shape
234-
dropout_p: must be 0.0 (not supported)
227+
query: Query tensor with size [B, H, L_q, D] and dtype torch.bfloat16
228+
key: Key tensor [B, H, L_kv, D] and dtype torch.bfloat16
229+
value: Value tensor [B, H, L_kv, D] and dtype torch.bfloat16
230+
attn_mask: Optional attention mask [B, H, L_q, L_kv] or broadcastable shape (2D: [L_q, L_kv] or 3D: [B, L_q, L_kv])
231+
dropout_p: must be 0.0 (other values are not supported)
235232
is_causal: whether to apply causal masking
236-
scale: attention scale (default: 1/sqrt(d))
237-
enable_gqa: must be False (not supported)
233+
scale: attention scale (default: 1/sqrt(D))
234+
enable_gqa: must be False (True is not supported)
238235
Returns:
239-
Output tensor [B, H, L_q, D]
236+
Output tensor [B, H, L_q, D] with dtype torch.bfloat16
240237
"""
241238
# Validate inputs
242239
if not (query.is_cuda and key.is_cuda and value.is_cuda):

backends/qualcomm/CMakeLists.txt

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,47 @@ get_filename_component(
2323
_common_include_directories "${EXECUTORCH_SOURCE_DIR}/.." ABSOLUTE
2424
)
2525

26+
# We only download QNN SDK when we build pip wheel for ExecuTorch. Please don't
27+
# change this code unless you know what you are doing.
28+
if(EXECUTORCH_BUILD_WHEEL_DO_NOT_USE)
29+
set(_qnn_default_sdk_dir "${CMAKE_CURRENT_BINARY_DIR}/sdk/qnn")
30+
31+
if(EXISTS "${_qnn_default_sdk_dir}" AND EXISTS "${_qnn_default_sdk_dir}/lib")
32+
message(STATUS "Found cached Qualcomm SDK at ${_qnn_default_sdk_dir}")
33+
set(QNN_SDK_ROOT
34+
${_qnn_default_sdk_dir}
35+
CACHE PATH "Qualcomm SDK root directory" FORCE
36+
)
37+
else()
38+
message(STATUS "Downloading Qualcomm SDK")
39+
execute_process(
40+
COMMAND
41+
${PYTHON_EXECUTABLE}
42+
${EXECUTORCH_SOURCE_DIR}/backends/qualcomm/scripts/download_qnn_sdk.py
43+
--dst-folder ${_qnn_default_sdk_dir} --print-sdk-path
44+
WORKING_DIRECTORY ${EXECUTORCH_SOURCE_DIR}
45+
RESULT_VARIABLE _qnn_sdk_download_result
46+
OUTPUT_VARIABLE _qnn_sdk_download_output
47+
ERROR_VARIABLE _qnn_sdk_download_error
48+
OUTPUT_STRIP_TRAILING_WHITESPACE
49+
)
50+
if(NOT _qnn_sdk_download_result EQUAL 0 OR _qnn_sdk_download_output
51+
STREQUAL ""
52+
)
53+
message(
54+
FATAL_ERROR
55+
"Failed to download Qualcomm SDK. stdout: ${_qnn_sdk_download_output}\n"
56+
"stderr: ${_qnn_sdk_download_error}"
57+
)
58+
endif()
59+
set(QNN_SDK_ROOT
60+
${_qnn_sdk_download_output}
61+
CACHE PATH "Qualcomm SDK root directory" FORCE
62+
)
63+
endif()
64+
set(ENV{QNN_SDK_ROOT} ${QNN_SDK_ROOT})
65+
endif()
66+
2667
if(NOT DEFINED QNN_SDK_ROOT)
2768
message(
2869
FATAL_ERROR
@@ -214,7 +255,9 @@ add_subdirectory(
214255
install(
215256
TARGETS qnn_executorch_backend
216257
EXPORT ExecuTorchTargets
217-
DESTINATION ${CMAKE_INSTALL_LIBDIR}
258+
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}/executorch/backends/qualcomm
259+
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}/executorch/backends/qualcomm
260+
RUNTIME DESTINATION ${CMAKE_INSTALL_LIBDIR}/executorch/backends/qualcomm
218261
)
219262

220263
# QNN pybind
@@ -275,4 +318,12 @@ if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64")
275318
${QNN_EXECUTORCH_ROOT_DIR}/aot/python
276319
${CMAKE_CURRENT_BINARY_DIR}/qnn_executorch/python
277320
)
321+
322+
install(
323+
TARGETS PyQnnManagerAdaptor PyQnnWrapperAdaptor
324+
LIBRARY
325+
DESTINATION ${CMAKE_INSTALL_LIBDIR}/executorch/backends/qualcomm/python
326+
RUNTIME
327+
DESTINATION ${CMAKE_INSTALL_LIBDIR}/executorch/backends/qualcomm/python
328+
)
278329
endif()

backends/qualcomm/scripts/download_qnn_sdk.py

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Add these imports for additional logging
1+
import argparse
22
import ctypes
33
import logging
44
import os
@@ -592,3 +592,46 @@ def install_qnn_sdk() -> bool:
592592

593593
# libc++ and QNN SDK setup
594594
return _ensure_libcxx_stack() and _ensure_qnn_sdk_lib()
595+
596+
597+
def main(argv: Optional[List[str]] = None) -> int:
598+
parser = argparse.ArgumentParser(
599+
description="Helper utility for Qualcomm SDK staging."
600+
)
601+
parser.add_argument(
602+
"--dst-folder",
603+
type=pathlib.Path,
604+
default=SDK_DIR,
605+
help="Destination directory for the Qualcomm SDK.",
606+
)
607+
parser.add_argument(
608+
"--print-sdk-path",
609+
action="store_true",
610+
help="Print the resolved Qualcomm SDK path to stdout.",
611+
)
612+
parser.add_argument(
613+
"--install-sdk",
614+
action="store_true",
615+
help="Ensure the SDK and runtime libraries are staged and loaded.",
616+
)
617+
args = parser.parse_args(argv)
618+
619+
logging.basicConfig(level=logging.INFO)
620+
621+
sdk_path: Optional[pathlib.Path]
622+
if args.install_sdk:
623+
if not install_qnn_sdk():
624+
return 1
625+
sdk_path = pathlib.Path(os.environ.get("QNN_SDK_ROOT", args.dst_folder))
626+
else:
627+
sdk_path = _download_qnn_sdk(dst_folder=args.dst_folder)
628+
if sdk_path is None:
629+
return 1
630+
631+
if args.print_sdk_path and sdk_path is not None:
632+
print(sdk_path)
633+
return 0
634+
635+
636+
if __name__ == "__main__":
637+
raise SystemExit(main())

examples/qualcomm/oss_scripts/llama/runner/runner.cpp

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -378,7 +378,22 @@ Error Runner<T>::generate_from_prompt_or_file(
378378
stats_.inference_start_ms = time_in_ms();
379379

380380
int32_t seq_len = config.seq_len;
381-
seq_len = (seq_len > 0 && seq_len <= context_len_) ? seq_len : context_len_;
381+
if (seq_len > context_len_) {
382+
ET_LOG(
383+
Info,
384+
"Warning: Requested seq_len (%d) exceeds compiled max_seq_len (%d). Clamping to %d.",
385+
seq_len,
386+
context_len_,
387+
context_len_);
388+
seq_len = context_len_;
389+
} else if (seq_len <= 0) {
390+
ET_LOG(
391+
Info,
392+
"Warning: Invalid seq_len (%d). Using compiled max_seq_len (%d).",
393+
seq_len,
394+
context_len_);
395+
seq_len = context_len_;
396+
}
382397
int32_t n_bos = (cur_pos_ == 0) ? 1 : 0;
383398

384399
// encode the (string) prompt into tokens sequence

examples/qualcomm/oss_scripts/llama/runner/token_generator.cpp

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -323,6 +323,30 @@ Result<int64_t> TokenGenerator<T>::generate(
323323
break;
324324
}
325325
}
326+
327+
// Check if generation was truncated due to seq_len limit (no EOS token)
328+
if (eos_ids_->count(cur_token) == 0 && pos >= seq_len - 1) {
329+
printf("\n");
330+
ET_LOG(
331+
Info,
332+
"Warning: Generation stopped at seq_len limit (%d) without reaching EOS token. Response may be incomplete.",
333+
seq_len);
334+
if (seq_len >= metadata_.context_len) {
335+
ET_LOG(
336+
Info,
337+
"- seq_len (%d) already equals compiled max_seq_len (%d). Consider recompiling with larger --max_seq_len.",
338+
seq_len,
339+
metadata_.context_len);
340+
} else {
341+
ET_LOG(
342+
Info,
343+
"- seq_len (%d) is less than compiled max_seq_len (%d). Consider increasing --seq_len (up to %d).",
344+
seq_len,
345+
metadata_.context_len,
346+
metadata_.context_len);
347+
}
348+
}
349+
326350
return pos - start_pos;
327351
}
328352
// Explicit instantiations

exir/serde/export_serialize.py

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2143,17 +2143,23 @@ def deserialize_meta_func(serialized_target: str):
21432143
def import_nn_module_stack(key, path, ty):
21442144
return key, (path, ty)
21452145

2146-
# Helper function that splits strings by commas except for those
2147-
# encapsulated by parens, which are valid traces.
2148-
# TODO: Currently this is needed due to indexing Sequential
2149-
# layers introducing names in the form "layer.slice(1, None, None)".
2150-
# If that naming is improved, this fancier splitting can probably be
2151-
# reverted to a simple split by comma.
2146+
# Helper function to split string by commas, accounting for nested parentheses/brackets
21522147
def metadata_split(metadata):
2153-
# Remove the parentheses and commas inside them
2154-
metadata = re.sub(r"\(.*?\)", "", metadata)
2155-
# Split the string by comma, except for those inside parentheses
2156-
return re.split(r"(?<!\()\s*,\s*(?!\()", metadata)
2148+
out = []
2149+
start, depth = 0, 0
2150+
for position, char in enumerate(metadata):
2151+
if char in "[(":
2152+
depth += 1
2153+
elif char in ")]":
2154+
depth -= 1
2155+
if depth < 0:
2156+
raise ValueError(f"Mismatched brackets in metadata: {metadata}")
2157+
elif char == "," and depth == 0:
2158+
out.append(metadata[start:position].strip())
2159+
start = position + 1
2160+
out.append(metadata[start:].strip())
2161+
assert len(out) == 3
2162+
return out
21572163

21582164
nn_module_stack = dict(
21592165
import_nn_module_stack(*metadata_split(item))

0 commit comments

Comments
 (0)