Update base for Update on "introduce triton sdpa kernel to cuda backend"

Gasoonjia · Gasoonjia · commit abbf37c2d9c4 · 2025-11-17T17:38:27.000-08:00
**Introduce Triton SDPA Kernel to CUDA Backend** This diff introduces a Triton-optimized implementation of the scaled dot-product attention (SDPA) kernel to the CUDA backend. The new kernel is designed to replace the default Edge SDPA operator during graph transformation, which accelerates model inference and removes the need for SDPA decomposition. **Changes** * Added a new file `sdpa.py` to the `fbcode/executorch/backends/cuda/triton/kernels` directory, which contains the Triton-optimized SDPA kernel implementation. * Added a new file `__init__.py` to `fbcode/executorch/backends/cuda/triton/replacement_pass`, which replaces the given existing edge ops with the target Triton kernels. * Added tests for exporting SDPA with the Triton kernel. Without the Triton kernel, an SDPA model cannot be exported. **Purpose** The purpose of this diff is to provide a high-performance SDPA kernel for the CUDA backend, which can be used to accelerate attention-based models on NVIDIA GPUs. Differential Revision: [D87259044](https://our.internmc.facebook.com/intern/diff/D87259044/) [ghstack-poisoned]
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -801,6 +801,9 @@ if(EXECUTORCH_BUILD_PYBIND)
       torch
   )
 
+  # RPATH for _portable_lib.so
+  set(_portable_lib_rpath "$ORIGIN/../../../torch/lib")
+
   if(EXECUTORCH_BUILD_EXTENSION_MODULE)
     # Always use static linking for pybindings to avoid runtime symbol
     # resolution issues
@@ -835,6 +838,7 @@ if(EXECUTORCH_BUILD_PYBIND)
 
   if(EXECUTORCH_BUILD_QNN)
     list(APPEND _dep_libs qnn_executorch_backend)
+    string(APPEND _portable_lib_rpath ":$ORIGIN/../../backends/qualcomm")
   endif()
 
   if(EXECUTORCH_BUILD_ENN)
@@ -886,19 +890,20 @@ if(EXECUTORCH_BUILD_PYBIND)
   target_compile_options(portable_lib PUBLIC ${_pybind_compile_options})
   target_link_libraries(portable_lib PRIVATE ${_dep_libs})
 
-  # Set RPATH to find PyTorch libraries relative to the installation location
-  # This goes from executorch/extension/pybindings up to site-packages, then to
-  # torch/lib. Don't do this to APPLE, as it will error out on the following
-  # error:
+  # Set RPATH to find PyTorch and backend libraries relative to the installation
+  # location. This goes from executorch/extension/pybindings up to
+  # site-packages, then to torch/lib. If QNN is enabled, also add
+  # backends/qualcomm/. Skip this on APPLE, because it fails with the linker
+  # error described below.
   #
   if(APPLE)
     # Skip setting @loader_path for APPLE, since it causes error like ld:
     # duplicate LC_RPATH '@loader_path' in '<site-packages>/torch/lib/
     # libtorch_cpu.dylib'
   else()
     set_target_properties(
-      portable_lib PROPERTIES BUILD_RPATH "$ORIGIN/../../../torch/lib"
-                              INSTALL_RPATH "$ORIGIN/../../../torch/lib"
+      portable_lib PROPERTIES BUILD_RPATH "${_portable_lib_rpath}"
+                              INSTALL_RPATH "${_portable_lib_rpath}"
     )
   endif()
 
diff --git a/backends/qualcomm/CMakeLists.txt b/backends/qualcomm/CMakeLists.txt
@@ -23,6 +23,47 @@ get_filename_component(
   _common_include_directories "${EXECUTORCH_SOURCE_DIR}/.." ABSOLUTE
 )
 
+# We only download QNN SDK when we build pip wheel for ExecuTorch. Please don't
+# change this code unless you know what you are doing.
+if(EXECUTORCH_BUILD_WHEEL_DO_NOT_USE)
+  set(_qnn_default_sdk_dir "${CMAKE_CURRENT_BINARY_DIR}/sdk/qnn")
+
+  if(EXISTS "${_qnn_default_sdk_dir}" AND EXISTS "${_qnn_default_sdk_dir}/lib")
+    message(STATUS "Found cached Qualcomm SDK at ${_qnn_default_sdk_dir}")
+    set(QNN_SDK_ROOT
+        ${_qnn_default_sdk_dir}
+        CACHE PATH "Qualcomm SDK root directory" FORCE
+    )
+  else()
+    message(STATUS "Downloading Qualcomm SDK")
+    execute_process(
+      COMMAND
+        ${PYTHON_EXECUTABLE}
+        ${EXECUTORCH_SOURCE_DIR}/backends/qualcomm/scripts/download_qnn_sdk.py
+        --dst-folder ${_qnn_default_sdk_dir} --print-sdk-path
+      WORKING_DIRECTORY ${EXECUTORCH_SOURCE_DIR}
+      RESULT_VARIABLE _qnn_sdk_download_result
+      OUTPUT_VARIABLE _qnn_sdk_download_output
+      ERROR_VARIABLE _qnn_sdk_download_error
+      OUTPUT_STRIP_TRAILING_WHITESPACE
+    )
+    if(NOT _qnn_sdk_download_result EQUAL 0 OR _qnn_sdk_download_output
+                                               STREQUAL ""
+    )
+      message(
+        FATAL_ERROR
+          "Failed to download Qualcomm SDK. stdout: ${_qnn_sdk_download_output}\n"
+          "stderr: ${_qnn_sdk_download_error}"
+      )
+    endif()
+    set(QNN_SDK_ROOT
+        ${_qnn_sdk_download_output}
+        CACHE PATH "Qualcomm SDK root directory" FORCE
+    )
+  endif()
+  set(ENV{QNN_SDK_ROOT} ${QNN_SDK_ROOT})
+endif()
+
 if(NOT DEFINED QNN_SDK_ROOT)
   message(
     FATAL_ERROR
@@ -214,7 +255,9 @@ add_subdirectory(
 install(
   TARGETS qnn_executorch_backend
   EXPORT ExecuTorchTargets
-  DESTINATION ${CMAKE_INSTALL_LIBDIR}
+  LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}/executorch/backends/qualcomm
+  ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}/executorch/backends/qualcomm
+  RUNTIME DESTINATION ${CMAKE_INSTALL_LIBDIR}/executorch/backends/qualcomm
 )
 
 # QNN pybind
@@ -275,4 +318,12 @@ if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64")
     ${QNN_EXECUTORCH_ROOT_DIR}/aot/python
     ${CMAKE_CURRENT_BINARY_DIR}/qnn_executorch/python
   )
+
+  install(
+    TARGETS PyQnnManagerAdaptor PyQnnWrapperAdaptor
+    LIBRARY
+      DESTINATION ${CMAKE_INSTALL_LIBDIR}/executorch/backends/qualcomm/python
+    RUNTIME
+      DESTINATION ${CMAKE_INSTALL_LIBDIR}/executorch/backends/qualcomm/python
+  )
 endif()
diff --git a/backends/qualcomm/scripts/download_qnn_sdk.py b/backends/qualcomm/scripts/download_qnn_sdk.py
@@ -1,4 +1,4 @@
-# Add these imports for additional logging
+import argparse
 import ctypes
 import logging
 import os
@@ -592,3 +592,46 @@ def install_qnn_sdk() -> bool:
 
     # libc++ and QNN SDK setup
     return _ensure_libcxx_stack() and _ensure_qnn_sdk_lib()
+
+
+def main(argv: Optional[List[str]] = None) -> int:
+    parser = argparse.ArgumentParser(
+        description="Helper utility for Qualcomm SDK staging."
+    )
+    parser.add_argument(
+        "--dst-folder",
+        type=pathlib.Path,
+        default=SDK_DIR,
+        help="Destination directory for the Qualcomm SDK.",
+    )
+    parser.add_argument(
+        "--print-sdk-path",
+        action="store_true",
+        help="Print the resolved Qualcomm SDK path to stdout.",
+    )
+    parser.add_argument(
+        "--install-sdk",
+        action="store_true",
+        help="Ensure the SDK and runtime libraries are staged and loaded.",
+    )
+    args = parser.parse_args(argv)
+
+    logging.basicConfig(level=logging.INFO)
+
+    sdk_path: Optional[pathlib.Path]
+    if args.install_sdk:
+        if not install_qnn_sdk():
+            return 1
+        sdk_path = pathlib.Path(os.environ.get("QNN_SDK_ROOT", args.dst_folder))
+    else:
+        sdk_path = _download_qnn_sdk(dst_folder=args.dst_folder)
+        if sdk_path is None:
+            return 1
+
+    if args.print_sdk_path and sdk_path is not None:
+        print(sdk_path)
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp
@@ -378,7 +378,22 @@ Error Runner<T>::generate_from_prompt_or_file(
   stats_.inference_start_ms = time_in_ms();
 
   int32_t seq_len = config.seq_len;
-  seq_len = (seq_len > 0 && seq_len <= context_len_) ? seq_len : context_len_;
+  if (seq_len > context_len_) {
+    ET_LOG(
+        Info,
+        "Warning: Requested seq_len (%d) exceeds compiled max_seq_len (%d). Clamping to %d.",
+        seq_len,
+        context_len_,
+        context_len_);
+    seq_len = context_len_;
+  } else if (seq_len <= 0) {
+    ET_LOG(
+        Info,
+        "Warning: Invalid seq_len (%d). Using compiled max_seq_len (%d).",
+        seq_len,
+        context_len_);
+    seq_len = context_len_;
+  }
   int32_t n_bos = (cur_pos_ == 0) ? 1 : 0;
 
   // encode the (string) prompt into tokens sequence
diff --git a/examples/qualcomm/oss_scripts/llama/runner/token_generator.cpp b/examples/qualcomm/oss_scripts/llama/runner/token_generator.cpp
@@ -323,6 +323,30 @@ Result<int64_t> TokenGenerator<T>::generate(
       break;
     }
   }
+
+  // Check if generation was truncated due to seq_len limit (no EOS token)
+  if (eos_ids_->count(cur_token) == 0 && pos >= seq_len - 1) {
+    printf("\n");
+    ET_LOG(
+        Info,
+        "Warning: Generation stopped at seq_len limit (%d) without reaching EOS token. Response may be incomplete.",
+        seq_len);
+    if (seq_len >= metadata_.context_len) {
+      ET_LOG(
+          Info,
+          "- seq_len (%d) already equals compiled max_seq_len (%d). Consider recompiling with larger --max_seq_len.",
+          seq_len,
+          metadata_.context_len);
+    } else {
+      ET_LOG(
+          Info,
+          "- seq_len (%d) is less than compiled max_seq_len (%d). Consider increasing --seq_len (up to %d).",
+          seq_len,
+          metadata_.context_len,
+          metadata_.context_len);
+    }
+  }
+
   return pos - start_pos;
 }
 // Explicit instantiations
diff --git a/exir/serde/export_serialize.py b/exir/serde/export_serialize.py
@@ -2143,17 +2143,23 @@ def deserialize_meta_func(serialized_target: str):
             def import_nn_module_stack(key, path, ty):
                 return key, (path, ty)
 
-            # Helper function that splits strings by commas except for those
-            # encapsulated by parens, which are valid traces.
-            # TODO: Currently this is needed due to indexing Sequential
-            # layers introducing names in the form "layer.slice(1, None, None)".
-            # If that naming is improved, this fancier splitting can probably be
-            # reverted to a simple split by comma.
+            # Helper function that splits a string on top-level commas only; commas nested inside parentheses/brackets do not split.
             def metadata_split(metadata):
-                # Remove the parentheses and commas inside them
-                metadata = re.sub(r"\(.*?\)", "", metadata)
-                # Split the string by comma, except for those inside parentheses
-                return re.split(r"(?<!\()\s*,\s*(?!\()", metadata)
+                out = []
+                start, depth = 0, 0
+                for position, char in enumerate(metadata):
+                    if char in "[(":
+                        depth += 1
+                    elif char in ")]":
+                        depth -= 1
+                        if depth < 0:
+                            raise ValueError(f"Mismatched brackets in metadata: {metadata}")
+                    elif char == "," and depth == 0:
+                        out.append(metadata[start:position].strip())
+                        start = position + 1
+                out.append(metadata[start:].strip())
+                assert len(out) == 3
+                return out
 
             nn_module_stack = dict(
                 import_nn_module_stack(*metadata_split(item))
diff --git a/setup.py b/setup.py
@@ -57,8 +57,6 @@
 import site
 import subprocess
 import sys
-import sysconfig
-import tempfile
 
 from distutils import log  # type: ignore[import-not-found]
 from distutils.sysconfig import get_python_lib  # type: ignore[import-not-found]
@@ -463,84 +461,6 @@ def run(self):
         if self._ran_build:
             return
 
-        try:
-            # Following code is for building the Qualcomm backend.
-            from backends.qualcomm.scripts.download_qnn_sdk import (
-                _download_qnn_sdk,
-                is_linux_x86,
-            )
-
-            if is_linux_x86():
-                os.environ["EXECUTORCH_BUILDING_WHEEL"] = "1"
-
-                with tempfile.TemporaryDirectory() as tmpdir:
-                    tmp_path = Path(tmpdir)
-                    sdk_path = _download_qnn_sdk(dst_folder=tmp_path)
-
-                    if not sdk_path:
-                        raise RuntimeError(
-                            "Qualcomm SDK not found, cannot build backend"
-                        )
-
-                    # Determine paths
-                    prj_root = Path(__file__).parent.resolve()
-                    build_sh = prj_root / "backends/qualcomm/scripts/build.sh"
-                    build_root = prj_root / "build-x86"
-
-                    if not build_sh.exists():
-                        raise FileNotFoundError(f"{build_sh} not found")
-
-                    # Run build.sh with SDK path exported
-                    env = dict(**os.environ)
-                    env["QNN_SDK_ROOT"] = str(sdk_path)
-                    subprocess.check_call(
-                        [
-                            str(build_sh),
-                            "--skip_linux_android",
-                            "--skip_linux_embedded",
-                        ],
-                        env=env,
-                    )
-
-                    # Copy the main .so into the wheel package
-                    so_src = (
-                        build_root / "backends/qualcomm/libqnn_executorch_backend.so"
-                    )
-                    so_dst = Path(
-                        self.get_ext_fullpath(
-                            "executorch.backends.qualcomm.qnn_backend"
-                        )
-                    )
-                    self.mkpath(str(so_dst.parent))  # ensure destination exists
-                    self.copy_file(str(so_src), str(so_dst))
-                    logging.info(f"Copied Qualcomm backend: {so_src} -> {so_dst}")
-
-                    # Copy Python adaptor .so files
-                    ext_suffix = sysconfig.get_config_var("EXT_SUFFIX")
-
-                    so_files = [
-                        (
-                            "executorch.backends.qualcomm.python.PyQnnManagerAdaptor",
-                            prj_root
-                            / f"backends/qualcomm/python/PyQnnManagerAdaptor{ext_suffix}",
-                        ),
-                        (
-                            "executorch.backends.qualcomm.python.PyQnnWrapperAdaptor",
-                            prj_root
-                            / f"backends/qualcomm/python/PyQnnWrapperAdaptor{ext_suffix}",
-                        ),
-                    ]
-
-                    for module_name, so_src in so_files:
-                        so_dst = Path(self.get_ext_fullpath(module_name))
-                        self.mkpath(str(so_dst.parent))
-                        self.copy_file(str(so_src), str(so_dst))
-                        logging.info(f"Copied Qualcomm backend: {so_src} -> {so_dst}")
-
-        except ImportError:
-            logging.error("Fail to build Qualcomm backend")
-            logging.exception("Import error")
-
         if self.editable_mode:
             self._ran_build = True
             self.run_command("build")
@@ -837,6 +757,11 @@ def run(self):  # noqa C901
             cmake_build_args += ["--target", "custom_ops_aot_lib"]
             cmake_build_args += ["--target", "quantized_ops_aot_lib"]
 
+        if cmake_cache.is_enabled("EXECUTORCH_BUILD_QNN"):
+            cmake_build_args += ["--target", "qnn_executorch_backend"]
+            cmake_build_args += ["--target", "PyQnnManagerAdaptor"]
+            cmake_build_args += ["--target", "PyQnnWrapperAdaptor"]
+
         # Set PYTHONPATH to the location of the pip package.
         os.environ["PYTHONPATH"] = (
             site.getsitepackages()[0] + ";" + os.environ.get("PYTHONPATH", "")
@@ -924,5 +849,24 @@ def run(self):  # noqa C901
             dst="executorch/data/lib/",
             dependent_cmake_flags=[],
         ),
+        BuiltFile(
+            src_dir="%CMAKE_CACHE_DIR%/backends/qualcomm/%BUILD_TYPE%/",
+            src_name="qnn_executorch_backend",
+            dst="executorch/backends/qualcomm/",
+            is_dynamic_lib=True,
+            dependent_cmake_flags=["EXECUTORCH_BUILD_QNN"],
+        ),
+        BuiltExtension(
+            src_dir="%CMAKE_CACHE_DIR%/backends/qualcomm/%BUILD_TYPE%/",
+            src="PyQnnManagerAdaptor.*",
+            modpath="executorch.backends.qualcomm.python.PyQnnManagerAdaptor",
+            dependent_cmake_flags=["EXECUTORCH_BUILD_QNN"],
+        ),
+        BuiltExtension(
+            src_dir="%CMAKE_CACHE_DIR%/backends/qualcomm/%BUILD_TYPE%/",
+            src="PyQnnWrapperAdaptor.*",
+            modpath="executorch.backends.qualcomm.python.PyQnnWrapperAdaptor",
+            dependent_cmake_flags=["EXECUTORCH_BUILD_QNN"],
+        ),
     ],
 )
diff --git a/tools/cmake/preset/default.cmake b/tools/cmake/preset/default.cmake
diff --git a/tools/cmake/preset/pybind.cmake b/tools/cmake/preset/pybind.cmake