diff --git a/.buckconfig b/.buckconfig index 8ccb8ef4aae..6aaf7221b3e 100644 --- a/.buckconfig +++ b/.buckconfig @@ -23,3 +23,14 @@ [parser] target_platform_detector_spec = target:root//...->prelude//platforms:default target:shim//...->prelude//platforms:default + +# Limit the number of files that the buck daemon needs to monitor. If every +# submodule is cloned recursively, some system can fail to build with "OS file +# watch limit reached". +[project] + ignore = \ + .git, \ + **/.git, \ + third-party/pytorch/third_party, \ + cmake-out, \ + pip-out diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt index 44632703e32..1fcaede5ad1 100644 --- a/.ci/docker/ci_commit_pins/pytorch.txt +++ b/.ci/docker/ci_commit_pins/pytorch.txt @@ -1 +1 @@ -0a038cf0cff2d071b7359ac0491fd2ba7798a438 +release/2.3 diff --git a/.github/workflows/_unittest.yml b/.github/workflows/_unittest.yml index c36c5861168..81a4bd60e9e 100644 --- a/.github/workflows/_unittest.yml +++ b/.github/workflows/_unittest.yml @@ -14,7 +14,7 @@ on: jobs: linux: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.3 strategy: matrix: include: @@ -44,7 +44,7 @@ jobs: pytest -n auto --cov=./ --cov-report=xml macos: - uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + uses: pytorch/test-infra/.github/workflows/macos_job.yml@release/2.3 strategy: matrix: include: diff --git a/.github/workflows/android.yml b/.github/workflows/android.yml index 0d8931cf102..06e4b2acd54 100644 --- a/.github/workflows/android.yml +++ b/.github/workflows/android.yml @@ -21,7 +21,7 @@ concurrency: jobs: test-demo-android: name: test-demo-android - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.3 strategy: matrix: include: diff --git a/.github/workflows/apple.yml b/.github/workflows/apple.yml index 06aa6a66e98..532316731fe 100644 --- a/.github/workflows/apple.yml +++ b/.github/workflows/apple.yml @@ -26,7 +26,7 @@ concurrency: jobs: test-demo-ios: name: test-demo-ios - uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + uses: pytorch/test-infra/.github/workflows/macos_job.yml@release/2.3 with: runner: macos-latest-xlarge python-version: '3.11' @@ -52,7 +52,7 @@ jobs: build-frameworks-ios: name: build-frameworks-ios - uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + uses: pytorch/test-infra/.github/workflows/macos_job.yml@release/2.3 with: runner: macos-latest-xlarge python-version: '3.11' @@ -64,7 +64,7 @@ jobs: WORKSPACE=$(pwd) pushd "${WORKSPACE}/pytorch/executorch" BUILD_TOOL=cmake - VERSION="0.1.0" + VERSION="0.2.0" FRAMEWORKS=( "executorch" "coreml_backend" diff --git a/.github/workflows/build-wheels-linux.yml b/.github/workflows/build-wheels-linux.yml index a2f86b219f8..abe680f946e 100644 --- a/.github/workflows/build-wheels-linux.yml +++ b/.github/workflows/build-wheels-linux.yml @@ -19,12 +19,12 @@ on: jobs: generate-matrix: - uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main + uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@release/2.3 with: package-type: wheel os: linux test-infra-repository: pytorch/test-infra - test-infra-ref: main + test-infra-ref: release/2.3 with-cuda: disabled with-rocm: disabled @@ -43,13 +43,18 @@ jobs: smoke-test-script: build/packaging/smoke_test.py package-name: executorch name: ${{ matrix.repository }} - uses: 
pytorch/test-infra/.github/workflows/build_wheels_linux.yml@main + uses: pytorch/test-infra/.github/workflows/build_wheels_linux.yml@release/2.3 with: repository: ${{ matrix.repository }} ref: "" test-infra-repository: pytorch/test-infra - test-infra-ref: main + test-infra-ref: release/2.3 build-matrix: ${{ needs.generate-matrix.outputs.matrix }} + # ExecuTorch only needs the first layer of submodules; override the + # "recursive" default to do less work, and to give the buck daemon fewer + # files to look at. + submodules: true + env-var-script: build/packaging/env_var_script_linux.sh pre-script: ${{ matrix.pre-script }} post-script: ${{ matrix.post-script }} package-name: ${{ matrix.package-name }} diff --git a/.github/workflows/build-wheels-m1.yml b/.github/workflows/build-wheels-m1.yml index dbc74433ff8..d36c22d23fc 100644 --- a/.github/workflows/build-wheels-m1.yml +++ b/.github/workflows/build-wheels-m1.yml @@ -19,12 +19,12 @@ on: jobs: generate-matrix: - uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main + uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@release/2.3 with: package-type: wheel os: macos-arm64 test-infra-repository: pytorch/test-infra - test-infra-ref: main + test-infra-ref: release/2.3 with-cuda: disabled with-rocm: disabled @@ -43,13 +43,18 @@ jobs: smoke-test-script: build/packaging/smoke_test.py package-name: executorch name: ${{ matrix.repository }} - uses: pytorch/test-infra/.github/workflows/build_wheels_macos.yml@main + uses: pytorch/test-infra/.github/workflows/build_wheels_macos.yml@release/2.3 with: repository: ${{ matrix.repository }} ref: "" test-infra-repository: pytorch/test-infra - test-infra-ref: main + test-infra-ref: release/2.3 build-matrix: ${{ needs.generate-matrix.outputs.matrix }} + # ExecuTorch only needs the first layer of submodules; override the + # "recursive" default to do less work, and to give the buck daemon fewer + # files to look at. + submodules: true + env-var-script: build/packaging/env_var_script_m1.sh pre-script: ${{ matrix.pre-script }} post-script: ${{ matrix.post-script }} package-name: ${{ matrix.package-name }} diff --git a/.github/workflows/doc-build.yml b/.github/workflows/doc-build.yml index ee5cfb859b3..238345489f4 100644 --- a/.github/workflows/doc-build.yml +++ b/.github/workflows/doc-build.yml @@ -14,7 +14,7 @@ on: jobs: build: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.3 strategy: matrix: include: @@ -68,26 +68,25 @@ jobs: make html cd .. + # If it's main branch, add noindex tag to all .html files to exclude from Google Search indexing. + GITHUB_REF=${{ github.ref }} + echo "GitHub Ref: ${GITHUB_REF}" + if [[ "${{ github.ref }}" == 'refs/heads/main' ]]; then + find docs/_build/html/ -name "*.html" -print0 | xargs -0 sed -i '//a \ \ '; + fi + cp -rf docs/_build/html/* "${RUNNER_DOCS_DIR}" mv docs/_build/html "${RUNNER_ARTIFACT_DIR}" ls -R "${RUNNER_ARTIFACT_DIR}"/*/*.html -# Enable preview later. 
Previews are available publicly -# -# upload-preview: -# if: github.repository == 'pytorch/executorch' && github.event_name == 'push' && -# (github.ref_type == 'branch' && github.ref_name == 'main') -# uses: pytorch/test-infra/.github/workflows/linux_job.yml@main - upload-gh-pages: needs: build - if: github.repository == 'pytorch/executorch' && github.event_name == 'push' && - ((github.ref_type == 'branch' && github.ref_name == 'main') || github.ref_type == 'tag') + if: github.repository == 'pytorch/executorch' && github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release/') || startsWith(github.ref, 'refs/tags/v')) permissions: contents: write - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.3 with: repository: pytorch/executorch download-artifact: docs @@ -99,6 +98,13 @@ jobs: REF_TYPE=${{ github.ref_type }} REF_NAME=${{ github.ref_name }} + # If it's main branch, add noindex tag to all .html files to exclude from Google Search indexing. + REF_NAME=$(echo "${{ github.ref }}") + echo "Ref name: ${REF_NAME}" + if [[ "${{ github.ref }}" == 'refs/heads/main' ]]; then + find docs -name "*.html" -print0 | xargs -0 sed -i '//a \ \ '; + fi + # If building for a release tag, branch, set the branch/tag name # as the target folder in the gh-pages branch. The artifacts created # during the build will be copied over to the target dir in the @@ -108,10 +114,16 @@ jobs: elif [[ "${REF_TYPE}" == tag ]]; then # Strip the leading "v" as well as the trailing patch version and "-rc" suffix. # For example: 'v0.1.2' -> '0.1' and 'v0.1.2-rc1' -> 0.1. - TARGET_FOLDER=$(echo "${REF_NAME}" | sed 's/^v//i; s/-rc[0-9]*$//; s/\.[0-9]*$//') - else - echo "ERROR: Invalid REF_TYPE: ${REF_TYPE}. Expected 'branch' or 'tag'." - exit 1 + case "${REF_NAME}" in + *-rc*) + echo "Aborting upload since this is an RC tag: ${REF_NAME}" + # We don't generate -rc* documentation but for actual tag only. + exit 0 + ;; + *) + TARGET_FOLDER=$(echo "${REF_NAME}" | sed 's/v\([0-9]\+\)\.\([0-9]\+\)\.[0-9]\+/\1.\2/') + ;; + esac fi echo "Target Folder: ${TARGET_FOLDER}" @@ -122,12 +134,6 @@ jobs: mv "${RUNNER_ARTIFACT_DIR}"/html/* "${TARGET_FOLDER}" git add "${TARGET_FOLDER}" || true - # If it's main branch, add noindex tag to all .html files to exclude from Google Search indexing. 
- if [[ "${REF_NAME}" == 'main' ]]; then - find "${TARGET_FOLDER}" -type f -name "*.html" -exec sed -i '//a ' {} \; - git add "${TARGET_FOLDER}"/**/*.html || true - fi - git config user.name 'pytorchbot' git config user.email 'soumith+bot@pytorch.org' git commit -m "Auto-generating sphinx docs" || true diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index f773f3aca88..6cf6e0495b3 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -50,7 +50,7 @@ jobs: mkdir "${GITHUB_WORKSPACE}" - name: Setup SSH (Click me for login details) - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.3 with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -58,11 +58,11 @@ jobs: uses: actions/checkout@v3 - name: Setup Linux - uses: pytorch/test-infra/.github/actions/setup-linux@main + uses: pytorch/test-infra/.github/actions/setup-linux@release/2.3 - name: Build docker image id: build-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.3 with: docker-image-name: ${{ matrix.docker-image-name }} always-rebuild: true @@ -70,5 +70,5 @@ jobs: force-push: true - name: Teardown Linux - uses: pytorch/test-infra/.github/actions/teardown-linux@main + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.3 if: always() diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 7cb2cf69b8b..a47f38d1b86 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -16,7 +16,7 @@ concurrency: jobs: lintrunner: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.3 with: runner: linux.2xlarge docker-image: executorch-ubuntu-22.04-linter diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index f8ffd41d214..efa3ed6f540 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -21,12 +21,12 @@ jobs: environment: ${{ (github.event_name == 'schedule') && 'update-commit-hash' || '' }} steps: - name: update-pytorch-commit-hash - uses: pytorch/test-infra/.github/actions/update-commit-hash@main + uses: pytorch/test-infra/.github/actions/update-commit-hash@release/2.3 if: ${{ github.event_name == 'schedule' }} with: repo-name: pytorch branch: main pin-folder: .ci/docker/ci_commit_pins - test-infra-ref: main + test-infra-ref: release/2.3 updatebot-token: ${{ secrets.UPDATEBOT_TOKEN }} pytorchbot-token: ${{ secrets.GH_PYTORCHBOT_TOKEN }} diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 9751b906cd8..6b3a25d89c8 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -33,7 +33,7 @@ jobs: test-setup-linux-gcc: name: test-setup-linux-gcc - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.3 strategy: matrix: include: @@ -58,7 +58,7 @@ jobs: test-models-linux: name: test-models-linux - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.3 needs: gather-models strategy: matrix: ${{ fromJSON(needs.gather-models.outputs.models) }} @@ -85,7 +85,7 @@ jobs: test-llama-runner-linux: name: test-llama-runner-linux - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: 
pytorch/test-infra/.github/workflows/linux_job.yml@release/2.3 strategy: matrix: dtype: [fp32] @@ -139,7 +139,7 @@ jobs: test-custom-ops-linux: name: test-custom-ops-linux - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.3 strategy: matrix: include: @@ -164,7 +164,7 @@ jobs: test-selective-build-linux: name: test-selective-build-linux - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.3 strategy: matrix: include: @@ -189,7 +189,7 @@ jobs: test-quantized-aot-lib-linux: name: test-quantized-aot-lib-linux - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.3 strategy: matrix: include: @@ -212,7 +212,7 @@ jobs: test-pybind-build-linux: name: test-pybind-build-linux - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.3 strategy: matrix: include: diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 16ed6a27577..a21e02a468c 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -34,7 +34,7 @@ jobs: test-models-macos: name: test-models-macos - uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + uses: pytorch/test-infra/.github/workflows/macos_job.yml@release/2.3 needs: gather-models strategy: matrix: ${{ fromJSON(needs.gather-models.outputs.models) }} @@ -63,7 +63,7 @@ jobs: test-custom-ops-macos: name: test-custom-ops-macos - uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + uses: pytorch/test-infra/.github/workflows/macos_job.yml@release/2.3 strategy: matrix: include: @@ -89,7 +89,7 @@ jobs: test-selective-build-macos: name: test-selective-build-macos - uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + uses: pytorch/test-infra/.github/workflows/macos_job.yml@release/2.3 strategy: matrix: include: @@ -115,7 +115,7 @@ jobs: test-demo-backend-delegation: name: test-demo-backend-delegation - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.3 strategy: matrix: include: @@ -139,7 +139,7 @@ jobs: test-arm-backend-delegation: name: test-arm-backend-delegation - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.3 with: runner: linux.2xlarge docker-image: executorch-ubuntu-22.04-arm-sdk @@ -169,7 +169,7 @@ jobs: test-arm-reference-delegation: name: test-arm-reference-delegation - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.3 with: runner: linux.2xlarge docker-image: executorch-ubuntu-22.04-arm-sdk @@ -200,7 +200,7 @@ jobs: test-coreml-delegate: name: test-coreml-delegate - uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + uses: pytorch/test-infra/.github/workflows/macos_job.yml@release/2.3 with: runner: macos-13-xlarge python-version: '3.11' @@ -222,7 +222,7 @@ jobs: test-pybind-build-macos: name: test-pybind-build-macos - uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + uses: pytorch/test-infra/.github/workflows/macos_job.yml@release/2.3 strategy: matrix: include: @@ -249,7 +249,7 @@ jobs: test-llama-runner-macos: name: test-llama-runner-mac - uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + uses: 
pytorch/test-infra/.github/workflows/macos_job.yml@release/2.3 strategy: matrix: dtype: [fp32] diff --git a/.github/workflows/update-viablestrict.yml b/.github/workflows/update-viablestrict.yml index 189a5cf3aa3..9bb89aa2be3 100644 --- a/.github/workflows/update-viablestrict.yml +++ b/.github/workflows/update-viablestrict.yml @@ -16,7 +16,7 @@ jobs: environment: ${{ (github.event_name == 'schedule') && 'update-viable-strict' || '' }} steps: - name: Update viable/strict - uses: pytorch/test-infra/.github/actions/update-viablestrict@main + uses: pytorch/test-infra/.github/actions/update-viablestrict@release/2.3 with: repository: pytorch/executorch stable-branch: viable/strict diff --git a/.gitignore b/.gitignore index 6661daed13e..26a46f23f62 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ buck-out/ cmake-out/ cmake-android-out/ +cmake-out-android/ cmake-ios-out/ ethos-u-scratch/ executorch.egg-info diff --git a/CMakeLists.txt b/CMakeLists.txt index 46b73f63492..75de1e01ae6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -352,23 +352,27 @@ add_subdirectory(schema) # Only contains primitive operators; does not contain portable kernels or other # full operators. Does not contain any backends. # - -add_library(executorch ${_executorch__srcs}) -target_link_libraries(executorch PRIVATE program_schema) -target_link_options_shared_lib(executorch) +add_library(executorch_no_prim_ops ${_executorch_no_prim_ops__srcs}) +target_link_libraries(executorch_no_prim_ops PRIVATE program_schema) # Check if dl exists for this toolchain and only then link it. find_library(DL_LIBRARY_EXISTS NAMES dl) # Check if the library was found if(DL_LIBRARY_EXISTS) - target_link_libraries(executorch PRIVATE dl) # For dladdr() + target_link_libraries(executorch_no_prim_ops PRIVATE dl) # For dladdr() endif() -target_include_directories(executorch PUBLIC ${_common_include_directories}) -target_compile_options(executorch PUBLIC ${_common_compile_options}) +target_include_directories(executorch_no_prim_ops PUBLIC ${_common_include_directories}) +target_compile_options(executorch_no_prim_ops PUBLIC ${_common_compile_options}) if(MAX_KERNEL_NUM) - target_compile_definitions(executorch + target_compile_definitions(executorch_no_prim_ops PRIVATE MAX_KERNEL_NUM=${MAX_KERNEL_NUM}) endif() +add_library(executorch ${_executorch__srcs}) +target_link_libraries(executorch PRIVATE executorch_no_prim_ops) +target_include_directories(executorch PUBLIC ${_common_include_directories}) +target_compile_options(executorch PUBLIC ${_common_compile_options}) +target_link_options_shared_lib(executorch) + # # portable_ops_lib: A library to register core ATen ops using portable kernels, # see kernels/portable/CMakeLists.txt. 
@@ -406,7 +410,7 @@ endif() # Install `executorch` library as well as `executorch-config.cmake` under # ${CMAKE_INSTALL_PREFIX}/ install( - TARGETS executorch + TARGETS executorch executorch_no_prim_ops DESTINATION lib INCLUDES DESTINATION ${_common_include_directories}) diff --git a/backends/apple/coreml/CMakeLists.txt b/backends/apple/coreml/CMakeLists.txt index f1c19d00ee8..3b3c26ece94 100644 --- a/backends/apple/coreml/CMakeLists.txt +++ b/backends/apple/coreml/CMakeLists.txt @@ -144,7 +144,7 @@ target_include_directories( ) target_link_libraries( coremldelegate PRIVATE - executorch + executorch_no_prim_ops ) if(EXECUTORCH_BUILD_SDK) @@ -174,7 +174,7 @@ find_library(SQLITE_LIBRARY sqlite3) target_link_libraries(coremldelegate PRIVATE - executorch + executorch_no_prim_ops ${ACCELERATE_FRAMEWORK} ${COREML_FRAMEWORK} ${FOUNDATION_FRAMEWORK} @@ -185,7 +185,7 @@ target_compile_options(coremldelegate PRIVATE "-fobjc-arc") target_compile_options(coremldelegate PRIVATE "-fno-exceptions") if(EXECUTORCH_BUILD_SDK) -target_compile_options(executorch PUBLIC -DET_EVENT_TRACER_ENABLED) +target_compile_options(executorch_no_prim_ops PUBLIC -DET_EVENT_TRACER_ENABLED) target_compile_options(coremldelegate PRIVATE "-frtti") target_compile_options(libprotobuf-lite PRIVATE "-frtti") else() diff --git a/backends/apple/coreml/runtime/workspace/executorchcoreml.xcodeproj/project.pbxproj b/backends/apple/coreml/runtime/workspace/executorchcoreml.xcodeproj/project.pbxproj index 4c9fc081b9e..cba1bfab8b0 100644 --- a/backends/apple/coreml/runtime/workspace/executorchcoreml.xcodeproj/project.pbxproj +++ b/backends/apple/coreml/runtime/workspace/executorchcoreml.xcodeproj/project.pbxproj @@ -100,6 +100,7 @@ C9E7D7952AB3F9BF00CCAE5D /* ETCoreMLModelManagerTests.mm in Sources */ = {isa = PBXBuildFile; fileRef = C9E7D78D2AB3F9BF00CCAE5D /* ETCoreMLModelManagerTests.mm */; }; C9E7D7962AB3F9BF00CCAE5D /* KeyValueStoreTests.mm in Sources */ = {isa = PBXBuildFile; fileRef = C9E7D78E2AB3F9BF00CCAE5D /* KeyValueStoreTests.mm */; }; C9E7D7A22AB3FBB200CCAE5D /* CoreMLBackendDelegateTests.mm in Sources */ = {isa = PBXBuildFile; fileRef = C9E7D7A12AB3FBB200CCAE5D /* CoreMLBackendDelegateTests.mm */; }; + F24817E52BC655E100E80D98 /* libexecutorch_no_prim_ops.a in Frameworks */ = {isa = PBXBuildFile; fileRef = F24817E42BC655E100E80D98 /* libexecutorch_no_prim_ops.a */; }; /* End PBXBuildFile section */ /* Begin PBXCopyFilesBuildPhase section */ @@ -297,6 +298,7 @@ C9EA3DB22B71A2B200B7D7BD /* CoreML.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = CoreML.framework; path = System/Library/Frameworks/CoreML.framework; sourceTree = SDKROOT; }; C9EA3FDE2B73EEA000B7D7BD /* libsqlite3.tbd */ = {isa = PBXFileReference; lastKnownFileType = "sourcecode.text-based-dylib-definition"; name = libsqlite3.tbd; path = usr/lib/libsqlite3.tbd; sourceTree = SDKROOT; }; C9EA3FE52B73EF6300B7D7BD /* Accelerate.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Accelerate.framework; path = System/Library/Frameworks/Accelerate.framework; sourceTree = SDKROOT; }; + F24817E42BC655E100E80D98 /* libexecutorch_no_prim_ops.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = libexecutorch_no_prim_ops.a; path = ../libraries/libexecutorch_no_prim_ops.a; sourceTree = ""; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ @@ -305,6 +307,7 @@ buildActionMask = 2147483647; files = ( C94D510F2ABDF87500AF47FD /* Accelerate.framework in Frameworks 
*/, + F24817E52BC655E100E80D98 /* libexecutorch_no_prim_ops.a in Frameworks */, C94D510E2ABDF86800AF47FD /* libsqlite3.tbd in Frameworks */, C94D50D92ABD7B2400AF47FD /* CoreML.framework in Frameworks */, C99883862B95AD7D000953A3 /* libprotobuf-lite.a in Frameworks */, @@ -523,6 +526,7 @@ C96560942AABFDCE005F8126 /* libsqlite3.tbd */, C96560922AABF992005F8126 /* CoreML.framework */, C96560902AABF982005F8126 /* Accelerate.framework */, + F24817E42BC655E100E80D98 /* libexecutorch_no_prim_ops.a */, C965608D2AABF72A005F8126 /* libexecutorch.a */, ); name = "Recovered References"; diff --git a/backends/apple/coreml/scripts/build_tests.sh b/backends/apple/coreml/scripts/build_tests.sh index 72afca2d6ce..730ba0839db 100755 --- a/backends/apple/coreml/scripts/build_tests.sh +++ b/backends/apple/coreml/scripts/build_tests.sh @@ -59,6 +59,7 @@ cmake --build "$CMAKE_PROTOBUF_BUILD_DIR_PATH" -j9 -t libprotobuf-lite echo "ExecuTorch: Copying libraries" mkdir "$LIBRARIES_DIR_PATH" cp -f "$CMAKE_EXECUTORCH_BUILD_DIR_PATH/libexecutorch.a" "$LIBRARIES_DIR_PATH" +cp -f "$CMAKE_EXECUTORCH_BUILD_DIR_PATH/libexecutorch_no_prim_ops.a" "$LIBRARIES_DIR_PATH" cp -f "$CMAKE_PROTOBUF_BUILD_DIR_PATH/libprotobuf-lite.a" "$LIBRARIES_DIR_PATH" #Copy ExecuTorch headers diff --git a/backends/apple/mps/CMakeLists.txt b/backends/apple/mps/CMakeLists.txt index ef64e26f2cc..50d91fe20fe 100644 --- a/backends/apple/mps/CMakeLists.txt +++ b/backends/apple/mps/CMakeLists.txt @@ -70,7 +70,7 @@ target_link_libraries(mpsdelegate PRIVATE bundled_program mps_schema - ${_executor_runner_libs} + executorch_no_prim_ops ${FOUNDATION_FRAMEWORK} ${METAL_FRAMEWORK} ${MPS_FRAMEWORK} diff --git a/backends/arm/arm_quantizer_utils.py b/backends/arm/arm_quantizer_utils.py index 63c98ee42d2..5275b5ecade 100644 --- a/backends/arm/arm_quantizer_utils.py +++ b/backends/arm/arm_quantizer_utils.py @@ -23,7 +23,7 @@ from torch.ao.quantization.pt2e.utils import ( _conv1d_bn_example_inputs, _conv2d_bn_example_inputs, - _get_aten_graph_module_for_pattern, + get_aten_graph_module, ) from torch.ao.quantization.quantizer import ( QuantizationAnnotation, @@ -478,7 +478,7 @@ def _do_annotate_conv_bn( # Match against all conv dimensions and cuda variants for (conv_fn, example_inputs), is_cuda, relu_is_inplace in combinations: pattern = _get_pattern(conv_fn, relu_is_inplace, has_relu) - pattern = _get_aten_graph_module_for_pattern(pattern, example_inputs, is_cuda) + pattern = get_aten_graph_module(pattern, example_inputs, is_cuda) pattern.graph.eliminate_dead_code() pattern.recompile() matcher = SubgraphMatcherWithNameNodeMap(pattern, ignore_literals=True) diff --git a/backends/qualcomm/CMakeLists.txt b/backends/qualcomm/CMakeLists.txt index 8883e5ee026..727952b4fe4 100644 --- a/backends/qualcomm/CMakeLists.txt +++ b/backends/qualcomm/CMakeLists.txt @@ -253,7 +253,7 @@ target_link_libraries(qnn_executorch_backend qnn_executorch_header qnn_schema qnn_manager - executorch + executorch_no_prim_ops qcir_utils ) target_link_libraries(utils diff --git a/backends/qualcomm/quantizer/utils.py b/backends/qualcomm/quantizer/utils.py index 809b7298eba..58be76aba11 100644 --- a/backends/qualcomm/quantizer/utils.py +++ b/backends/qualcomm/quantizer/utils.py @@ -9,6 +9,7 @@ import torch from torch._ops import OpOverload +from torch._subclasses import FakeTensor from torch.ao.quantization.quantizer import ( QuantizationAnnotation, @@ -42,6 +43,19 @@ def decorator(annotator: Callable): return decorator +def _is_input_float_tensor(node: Node): + """Check if the input is not a float 
tensor, so that we can skip quantization for the node + since observers only works with float Tensors + """ + if ( + not isinstance(node, Node) + or "val" not in node.meta + or not isinstance(node.meta["val"], FakeTensor) + ): + return False + return node.meta["val"].dtype == torch.float32 + + def _is_annotated(nodes: List[Node]): """ Given a list of nodes (that represents an operator pattern), @@ -123,11 +137,11 @@ def annotate_binary(node: Node, quantization_config: QuantizationConfig) -> None input_qspec_map = {} input_act0 = node.args[0] - if isinstance(input_act0, Node): + if _is_input_float_tensor(input_act0): input_qspec_map[input_act0] = input_act_qspec input_act1 = node.args[1] - if isinstance(input_act1, Node): + if _is_input_float_tensor(input_act1): input_qspec_map[input_act1] = input_act_qspec node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation( diff --git a/backends/vulkan/README.md b/backends/vulkan/README.md new file mode 100644 index 00000000000..bc5a674970f --- /dev/null +++ b/backends/vulkan/README.md @@ -0,0 +1,192 @@ +# ExecuTorch Vulkan Delegate + +The ExecuTorch Vulkan delegate is a native GPU delegate for ExecuTorch that is +built on top of the cross-platform Vulkan GPU API standard. It is primarily +designed to leverage the GPU to accelerate model inference on Android devices, +but can be used on any platform that supports an implementation of Vulkan: +laptops, servers, and edge devices. + +::::{note} +The Vulkan delegate is currently under active development, and its components +are subject to change. +:::: + +## What is Vulkan? + +Vulkan is a low-level GPU API specification developed as a successor to OpenGL. +It is designed to offer developers more explicit control over GPUs compared to +previous specifications in order to reduce overhead and maximize the +capabilities of the modern graphics hardware. + +Vulkan has been widely adopted among GPU vendors, and most modern GPUs (both +desktop and mobile) in the market support Vulkan. Vulkan is also included in +Android from Android 7.0 onwards. + +**Note that Vulkan is a GPU API, not a GPU Math Library**. That is to say it +provides a way to execute compute and graphics operations on a GPU, but does not +come with a built-in library of performant compute kernels. + +## The Vulkan Compute Library + +The ExecuTorch Vulkan Delegate is a wrapper around a standalone runtime known as +the **Vulkan Compute Library**. The aim of the Vulkan Compute Library is to +provide GPU implementations for PyTorch operators via GLSL compute shaders. + +The Vulkan Compute Library is a fork/iteration of the [PyTorch Vulkan Backend](https://pytorch.org/tutorials/prototype/vulkan_workflow.html). +The core components of the PyTorch Vulkan backend were forked into ExecuTorch +and adapted for an AOT graph-mode style of model inference (as opposed to +PyTorch which adopted an eager execution style of model inference). + +The components of the Vulkan Compute Library are contained in the +`executorch/backends/vulkan/runtime/` directory. The core components are listed +and described below: + +``` +runtime/ +├── api/ .................... Wrapper API around Vulkan to manage Vulkan objects +└── graph/ .................. ComputeGraph class which implements graph mode inference + └── ops/ ................ Base directory for operator implementations + ├── glsl/ ........... GLSL compute shaders + │ ├── *.glsl + │ └── conv2d.glsl + └── impl/ ........... 
C++ code to dispatch GPU compute shaders + ├── *.cpp + └── Conv2d.cpp +``` + +## Features + +The Vulkan delegate currently supports the following features: + +* **Memory Planning** + * Intermediate tensors whose lifetimes do not overlap will share memory allocations. This reduces the peak memory usage of model inference. +* **Capability Based Partitioning**: + * A graph can be partially lowered to the Vulkan delegate via a partitioner, which will identify nodes (i.e. operators) that are supported by the Vulkan delegate and lower only supported subgraphs +* **Support for upper-bound dynamic shapes**: + * Tensors can change shape between inferences as long as its current shape is smaller than the bounds specified during lowering + +In addition to increasing operator coverage, the following features are +currently in development: + +* **Quantization Support** + * We are currently working on support for 8-bit dynamic quantization, with plans to extend to other quantization schemes in the future. +* **Memory Layout Management** + * Memory layout is an important factor to optimizing performance. We plan to introduce graph passes to introduce memory layout transitions throughout a graph to optimize memory-layout sensitive operators such as Convolution and Matrix Multiplication. +* **Selective Build** + * We plan to make it possible to control build size by selecting which operators/shaders you want to build with + +## End to End Example + +To further understand the features of the Vulkan Delegate and how to use it, +consider the following end to end example with MobileNet V2. + +### Compile and lower a model to the Vulkan Delegate + +Assuming ExecuTorch has been set up and installed, the following script can be +used to produce a lowered MobileNet V2 model as `vulkan_mobilenetv2.pte`. + +``` +import torch +import torchvision.models as models + +from torch.export import export, ExportedProgram +from torchvision.models.mobilenetv2 import MobileNet_V2_Weights +from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner +from executorch.exir import EdgeProgramManager, ExecutorchProgramManager, to_edge +from executorch.exir.backend.backend_api import to_backend + +mobilenet_v2 = models.mobilenetv2.mobilenet_v2(weights=MobileNet_V2_Weights.DEFAULT).eval() +sample_inputs = (torch.randn(1, 3, 224, 224), ) + +exported_program: ExportedProgram = export(mobilenet_v2, sample_inputs) +edge: EdgeProgramManager = to_edge(exported_program) + +# Lower the model to Vulkan backend +edge = edge.to_backend(VulkanPartitioner()) + +exec_prog = edge.to_executorch() + +with open("vulkan_mobilenetv2.pte", "wb") as file: + exec_prog.write_to_file(file) +``` + +Like other ExecuTorch delegates, a model can be lowered to the Vulkan Delegate +using the `to_backend()` API. The Vulkan Delegate implements the +`VulkanPartitioner` class which identifies nodes (i.e. operators) in the graph +that are supported by the Vulkan delegate, and separates compatible sections of +the model to be executed on the GPU. + +This means the a model can be lowered to the Vulkan delegate even if it contains +some unsupported operators. This will just mean that only parts of the graph +will be executed on the GPU. + + +::::{note} +The [Vulkan partitioner code](https://github.com/pytorch/executorch/blob/main/backends/vulkan/partitioner/vulkan_partitioner.py) +can be inspected to examine which ops are currently implemented in the Vulkan +delegate. 
+:::: + +### Build Vulkan Delegate libraries + +The easiest way to build and test the Vulkan Delegate is to build for Android +and test on a local Android device. Android devices have built in support for +Vulkan, and the Android NDK ships with a GLSL compiler, which is needed to +compile the Vulkan Compute Library's GLSL compute shaders. + +The Vulkan Delegate libraries can be built by setting `-DEXECUTORCH_BUILD_VULKAN=ON` +when building with CMake. + +First, make sure that you have the Android NDK installed - Android NDK r25c is +recommended. The Android SDK should also be installed so that you have access +to `adb`. + +```shell +# Recommended version is Android NDK r25c. +export ANDROID_NDK= +# Select an appropriate Android ABI +export ANDROID_ABI=arm64-v8a +# All subsequent commands should be performed from ExecuTorch repo root +cd +# Make sure adb works +adb --version +``` + +To build and install ExecuTorch libraries (for Android) with the Vulkan +Delegate: + +```shell +# From executorch root directory +(rm -rf cmake-android-out && \ + pp cmake . -DCMAKE_INSTALL_PREFIX=cmake-android-out \ + -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ + -DANDROID_ABI=$ANDROID_ABI \ + -DEXECUTORCH_BUILD_VULKAN=ON \ + -DPYTHON_EXECUTABLE=python \ + -Bcmake-android-out && \ + cmake --build cmake-android-out -j16 --target install) +``` + +### Run the Vulkan model on device + +::::{note} +Since operator support is currently limited, only binary arithmetic operators +will run on the GPU. Expect inference to be slow as the majority of operators +are being executed via Portable operators. +:::: + +Now, the partially delegated model can be executed (partially) on your device's +GPU! + +```shell +# Build a model runner binary linked with the Vulkan delegate libs +cmake --build cmake-android-out --target vulkan_executor_runner -j32 + +# Push model to device +adb push vulkan_mobilenetv2.pte /data/local/tmp/vulkan_mobilenetv2.pte +# Push binary to device +adb push cmake-android-out/backends/vulkan/vulkan_executor_runner /data/local/tmp/runner_bin + +# Run the model +adb shell /data/local/tmp/runner_bin --model_path /data/local/tmp/vulkan_mobilenetv2.pte +``` diff --git a/backends/vulkan/docs/android_demo.md b/backends/vulkan/docs/android_demo.md new file mode 100644 index 00000000000..f9fc35657a6 --- /dev/null +++ b/backends/vulkan/docs/android_demo.md @@ -0,0 +1,148 @@ +# Building and Running ExecuTorch with the Vulkan Backend + +The [ExecuTorch Vulkan Delegate](./native-delegates-executorch-vulkan-delegate.md) +is a native GPU delegate for ExecuTorch. + + +::::{grid} 2 +:::{grid-item-card} What you will learn in this tutorial: +:class-card: card-content +* How to export the Stories 110M parameter model with partial GPU delegation +* How to execute the partially delegated model on Android +::: +:::{grid-item-card} Prerequisites: +:class-card: card-prerequisites +* Follow [**Setting up ExecuTorch**](./getting-started-setup.md) +* Follow [**Setting up the ExecuTorch LLaMA Android Demo App**](./llm/llama-demo-android.md) +::: +:::: + +## Prerequisites + +Note that all the steps below should be performed from the ExecuTorch repository +root directory, and assumes that you have gone through the steps of setting up +ExecuTorch. + +You should also refer to the **Prerequisites** section of the [**Setting up the ExecuTorch LLaMA Android Demo App**](./llm/llama-demo-android.md) +Tutorial in order to install the specified versions of the Android NDK and the +Android SDK. 
+ +```shell +# Recommended version is Android NDK r25c. +export ANDROID_NDK= +# Select an appropriate Android ABI +export ANDROID_ABI=arm64-v8a +# All subsequent commands should be performed from ExecuTorch repo root +cd +# Make sure adb works +adb --version +``` + +## Lowering the Stories 110M model to Vulkan + +::::{note} +The resultant model will only be partially delegated to the Vulkan backend. In +particular, only binary arithmetic operators (`aten.add`, `aten.sub`, +`aten.mul`, `aten.div`) and the matrix multiplication operator (`aten.mm`) will +be executed on the GPU via the Vulkan delegate. The rest of the model will be +executed using Portable operators. This is because the Vulkan delegate is still +early in development and currently has limited operator coverage. +:::: + +First, download `stories110M.pt` and `tokenizer.model` from Github: + +```shell +wget "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.pt" +wget "https://raw.githubusercontent.com/karpathy/llama2.c/master/tokenizer.model" +``` + +Next, create the params file: + +```shell +echo '{"dim": 768, "multiple_of": 32, "n_heads": 12, "n_layers": 12, "norm_eps": 1e-05, "vocab_size": 32000}' > params.json +``` + +Then, create a tokenizer binary file: + +```shell +python -m examples.models.llama2.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin +``` + +Finally, export the `stories110M.pt` file into an ExecuTorch program: + +```shell +python -m examples.models.llama2.export_llama -c stories110M.pt -p params.json --vulkan +``` + +A `vulkan_llama2.pte` file should have been created as a result of the last step. + +Push the tokenizer binary and `vulkan_llama2.pte` onto your Android device: + +```shell +adb mkdir /data/local/tmp/llama/ +adb push tokenizer.bin /data/local/tmp/llama/ +adb push vulkan_llama2.pte /data/local/tmp/llama/ +``` + +## Build and Run the LLaMA runner binary on Android + +First, build and install ExecuTorch libraries, then build the LLaMA runner +binary using the Android NDK toolchain. + +```shell +(rm -rf cmake-android-out && \ + cmake . -DCMAKE_INSTALL_PREFIX=cmake-android-out \ + -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ + -DANDROID_ABI=$ANDROID_ABI \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_VULKAN=ON \ + -DEXECUTORCH_BUILD_OPTIMIZED=ON \ + -DPYTHON_EXECUTABLE=python \ + -Bcmake-android-out && \ + cmake --build cmake-android-out -j16 --target install) + +# Build LLaMA Runner library +(rm -rf cmake-android-out/examples/models/llama2 && \ + cmake examples/models/llama2 \ + -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ + -DANDROID_ABI=$ANDROID_ABI \ + -DCMAKE_INSTALL_PREFIX=cmake-android-out \ + -DPYTHON_EXECUTABLE=python \ + -Bcmake-android-out/examples/models/llama2 && \ + cmake --build cmake-android-out/examples/models/llama2 -j16) +``` + +Finally, push and run the llama runner binary on your Android device. + +```shell +adb push cmake-android-out/examples/models/llama2/llama_main /data/local/tmp/llama_main + +adb shell /data/local/tmp/llama_main \ + --model_path=/data/local/tmp/llama/vulkan_llama2.pte \ + --tokenizer_path=/data/local/tmp/llama/tokenizer.bin \ + --prompt "hi" \--temperature=0 +``` + +The following output will be produced: + +``` +hippo named Hippy lived in a big pond. Hippy was a very happy hippo. He liked to play... 
+``` + +## Running with the LLaMA Android Demo App + +It is also possible to run the partially delegated Vulkan model inside the LLaMA +Android demo app. + +First, make some modifications to the Android app setup script to make sure that +the Vulkan backend is built when building and installing ExecuTorch libraries: + +```shell +# Run from executorch root directory. You can also edit this in a code editor +sed -i 's/-DEXECUTORCH_BUILD_XNNPACK=ON/-DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_VULKAN=ON/g' examples/demo-apps/android/LlamaDemo/setup.sh +``` + +Then, Follow the instructions at [**Setting up the ExecuTorch LLaMA Android Demo App**](./llm/llama-demo-android.md) +to build and run the demo application on your Android device. Once the app +starts up, you can load and run the `vulkan_llama2.pte` model with the app. diff --git a/build/Utils.cmake b/build/Utils.cmake index 39fa7317da8..66b740ad1eb 100644 --- a/build/Utils.cmake +++ b/build/Utils.cmake @@ -213,6 +213,12 @@ function(resolve_buck2) PARENT_SCOPE) endif() endif() + + # The buck2 daemon can get stuck. Killing it can help. + message(STATUS "Killing buck2 daemon") + execute_process( + COMMAND "${BUCK2} kill" + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) endfunction() # Sets the value of the PYTHON_EXECUTABLE variable to 'python' if in an active diff --git a/build/build_apple_frameworks.sh b/build/build_apple_frameworks.sh index 78425485526..cbaa903ecda 100755 --- a/build/build_apple_frameworks.sh +++ b/build/build_apple_frameworks.sh @@ -25,7 +25,7 @@ PORTABLE=OFF QUANTIZED=OFF XNNPACK=OFF HEADERS_PATH="include" -EXECUTORCH_FRAMEWORK="executorch:libexecutorch.a,libextension_apple.a,libextension_data_loader.a,libextension_module.a:$HEADERS_PATH" +EXECUTORCH_FRAMEWORK="executorch:libexecutorch.a,libexecutorch_no_prim_ops.a,libextension_apple.a,libextension_data_loader.a,libextension_module.a:$HEADERS_PATH" COREML_FRAMEWORK="coreml_backend:libcoremldelegate.a:" CUSTOM_FRAMEWORK="custom_backend:libcustom_ops.a,libcustom_ops_lib.a:" MPS_FRAMEWORK="mps_backend:libmpsdelegate.a:" diff --git a/build/cmake_deps.toml b/build/cmake_deps.toml index 4346881224b..91174c08f75 100644 --- a/build/cmake_deps.toml +++ b/build/cmake_deps.toml @@ -19,6 +19,18 @@ excludes = [ buck_targets = [ "//runtime/executor:program", ] +deps = [ + "executorch_no_prim_ops", +] +filters = [ + ".cpp$", +] + + +[targets.executorch_no_prim_ops] +buck_targets = [ + "//runtime/executor:program_no_prim_ops", +] deps = [ "program_schema", ] @@ -43,6 +55,7 @@ excludes = [ ] deps = [ "executorch", + "executorch_no_prim_ops", ] [targets.optimized_kernels] @@ -59,6 +72,7 @@ excludes = [ ] deps = [ "executorch", + "executorch_no_prim_ops", "portable_kernels", ] @@ -76,6 +90,7 @@ excludes = [ ] deps = [ "executorch", + "executorch_no_prim_ops", "portable_kernels", ] @@ -97,6 +112,7 @@ filters = [ excludes = [ ] deps = [ + "executorch_no_prim_ops", "executorch", ] @@ -113,6 +129,7 @@ filters = [ ".cpp$", ] deps = [ + "executorch_no_prim_ops", "executorch", ] @@ -125,6 +142,7 @@ filters = [ ] deps = [ "executorch", + "executorch_no_prim_ops", "extension_data_loader", ] @@ -137,6 +155,7 @@ filters = [ ] deps = [ "executorch", + "executorch_no_prim_ops", ] # ---------------------------------- extension end ---------------------------------- @@ -154,6 +173,7 @@ excludes = [ ] deps = [ "executorch", + "executorch_no_prim_ops", "portable_kernels", "quantized_kernels", ] @@ -169,6 +189,7 @@ excludes = [ "^codegen", ] deps = [ + "executorch_no_prim_ops", "executorch", ] # 
---------------------------------- binary end ---------------------------------- @@ -185,6 +206,7 @@ excludes = [ ] deps = [ "executorch", + "executorch_no_prim_ops", "portable_kernels", ] @@ -197,6 +219,7 @@ filters = [ ] deps = [ "executorch", + "executorch_no_prim_ops", ] [targets.mps_schema] @@ -222,6 +245,7 @@ excludes = [ ] deps = [ "executorch", + "executorch_no_prim_ops", "xnnpack_backend", "portable_kernels", ] @@ -235,6 +259,7 @@ filters = [ ] deps = [ "executorch", + "executorch_no_prim_ops", ] [targets.xnnpack_dynamic_quant_utils] @@ -275,6 +300,7 @@ excludes = [ ] deps = [ "executorch", + "executorch_no_prim_ops", "optimized_kernels", "xnnpack_backend", ] @@ -292,6 +318,7 @@ excludes = [ deps = [ "custom_ops", "executorch", + "executorch_no_prim_ops", "extension_data_loader", "extension_module", "portable_kernels", diff --git a/build/executorch-config.cmake b/build/executorch-config.cmake index 14ec7bf1f45..acf8b6779d5 100644 --- a/build/executorch-config.cmake +++ b/build/executorch-config.cmake @@ -13,27 +13,20 @@ cmake_minimum_required(VERSION 3.19) set(_root "${CMAKE_CURRENT_LIST_DIR}/../..") -add_library(executorch STATIC IMPORTED) -find_library( - EXECUTORCH_LIBRARY_PATH executorch - HINTS "${_root}" - CMAKE_FIND_ROOT_PATH_BOTH -) -set_target_properties( - executorch PROPERTIES IMPORTED_LOCATION "${EXECUTORCH_LIBRARY_PATH}" -) -target_include_directories(executorch INTERFACE ${_root}) +set(required_lib_list executorch executorch_no_prim_ops portable_kernels) +foreach(lib ${required_lib_list}) + set(lib_var "LIB_${lib}") + add_library(${lib} STATIC IMPORTED) + find_library( + ${lib_var} ${lib} HINTS "${_root}" CMAKE_FIND_ROOT_PATH_BOTH + ) + set_target_properties( + ${lib} PROPERTIES IMPORTED_LOCATION "${${lib_var}}" + ) + target_include_directories(${lib} INTERFACE ${_root}) +endforeach() -add_library(portable_kernels STATIC IMPORTED) -find_library( - PORTABLE_KERNELS_PATH portable_kernels - HINTS "${_root}" - CMAKE_FIND_ROOT_PATH_BOTH -) -set_target_properties( - portable_kernels PROPERTIES IMPORTED_LOCATION "${PORTABLE_KERNELS_PATH}" -) -target_include_directories(portable_kernels INTERFACE ${_root}) +target_link_libraries(executorch INTERFACE executorch_no_prim_ops) if(CMAKE_BUILD_TYPE MATCHES "Debug") set(FLATCCRT_LIB flatccrt_d) diff --git a/build/packaging/env_var_script_linux.sh b/build/packaging/env_var_script_linux.sh new file mode 100644 index 00000000000..6379dee6b5a --- /dev/null +++ b/build/packaging/env_var_script_linux.sh @@ -0,0 +1,20 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# This file is sourced into the environment before building a pip wheel. It +# should typically only contain shell variable assignments. Be sure to export +# any variables so that subprocesses will see them. + +# Enable pybindings so that users can execute ExecuTorch programs from python. +export EXECUTORCH_BUILD_PYBIND=1 + +# Ensure that CMAKE_ARGS is defined before referencing it. Defaults to empty +# if not defined. +export CMAKE_ARGS="${CMAKE_ARGS:-}" + +# Link the XNNPACK backend into the pybindings runtime so that users can execute +# ExecuTorch programs that delegate to it. 
+CMAKE_ARGS="${CMAKE_ARGS} -DEXECUTORCH_BUILD_XNNPACK=ON" diff --git a/build/packaging/env_var_script_m1.sh b/build/packaging/env_var_script_m1.sh new file mode 100644 index 00000000000..b7f92b321ea --- /dev/null +++ b/build/packaging/env_var_script_m1.sh @@ -0,0 +1,28 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# This file is sourced into the environment before building a pip wheel. It +# should typically only contain shell variable assignments. Be sure to export +# any variables so that subprocesses will see them. + +# Enable pybindings so that users can execute ExecuTorch programs from python. +export EXECUTORCH_BUILD_PYBIND=1 + +# Ensure that CMAKE_ARGS is defined before referencing it. Defaults to empty +# if not defined. +export CMAKE_ARGS="${CMAKE_ARGS:-}" + +# Link the XNNPACK backend into the pybindings runtime so that users can execute +# ExecuTorch programs that delegate to it. +CMAKE_ARGS="${CMAKE_ARGS} -DEXECUTORCH_BUILD_XNNPACK=ON" + +# When building for macOS, link additional backends into the pybindings runtime. + +# TODO(dbort): Core ML requires features only available in macOS 10.15, but the +# build machine uses an older version. +# CMAKE_ARGS="${CMAKE_ARGS} -DEXECUTORCH_BUILD_COREML=ON" + +CMAKE_ARGS="${CMAKE_ARGS} -DEXECUTORCH_BUILD_MPS=ON" diff --git a/build/packaging/pre_build_script.sh b/build/packaging/pre_build_script.sh index 3940168c403..eeb4b95b007 100644 --- a/build/packaging/pre_build_script.sh +++ b/build/packaging/pre_build_script.sh @@ -5,6 +5,21 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -set -eux +set -euxo pipefail -echo "This script is run before building ExecuTorch binaries" +# This script is run before building ExecuTorch binaries + +# Manually install build requirements because `python setup.py bdist_wheel` does +# not install them. TODO(dbort): Switch to using `python -m build --wheel`, +# which does install them. Though we'd need to disable build isolation to be +# able to see the installed torch package. +readonly BUILD_DEPS=( + # This list must match the build-system.requires list from pyproject.toml. 
+ "cmake" + "pyyaml" + "setuptools" + "tomli" + "wheel" + "zstd" +) +pip install --progress-bar off "${BUILD_DEPS[@]}" diff --git a/build/resolve_buck.py b/build/resolve_buck.py index cba151ab340..463e6bf6c37 100644 --- a/build/resolve_buck.py +++ b/build/resolve_buck.py @@ -76,6 +76,10 @@ class BuckInfo: archive_name="buck2-aarch64-apple-darwin.zst", target_versions=["99e407b49dc432eda0cbddd67ea78346"], ), + ("darwin", "x86_64"): BuckInfo( + archive_name="buck2-x86_64-apple-darwin.zst", + target_versions=["9150d78e7a7531799a1b06ce58623bbc"], + ), } diff --git a/docs/source/_static/img/llama_ios_app.mp4 b/docs/source/_static/img/llama_ios_app.mp4 new file mode 100644 index 00000000000..fead47644d6 Binary files /dev/null and b/docs/source/_static/img/llama_ios_app.mp4 differ diff --git a/docs/source/_static/img/llama_ios_app.png b/docs/source/_static/img/llama_ios_app.png new file mode 100644 index 00000000000..4f9020efb87 Binary files /dev/null and b/docs/source/_static/img/llama_ios_app.png differ diff --git a/docs/source/_static/img/llm_manual_print_data_tabular.png b/docs/source/_static/img/llm_manual_print_data_tabular.png new file mode 100644 index 00000000000..6052a404246 Binary files /dev/null and b/docs/source/_static/img/llm_manual_print_data_tabular.png differ diff --git a/docs/source/build-run-qualcomm-ai-engine-direct-backend.md b/docs/source/build-run-qualcomm-ai-engine-direct-backend.md index 30904b29ddb..1a94577e90c 100644 --- a/docs/source/build-run-qualcomm-ai-engine-direct-backend.md +++ b/docs/source/build-run-qualcomm-ai-engine-direct-backend.md @@ -115,6 +115,10 @@ Python APIs on x64 are required to compile models to Qualcomm AI Engine Direct b ```bash cd $EXECUTORCH_ROOT +# Workaround for fbs files in exir/_serialize +cp schema/program.fbs exir/_serialize/program.fbs +cp schema/scalar_type.fbs exir/_serialize/scalar_type.fbs + mkdir build_x86_64 cd build_x86_64 cmake .. -DEXECUTORCH_BUILD_QNN=ON -DQNN_SDK_ROOT=${QNN_SDK_ROOT} @@ -138,8 +142,8 @@ mkdir build_android cd build_android # build executorch & qnn_executorch_backend cmake .. \ - -DBUCK2=buck2 \ -DCMAKE_INSTALL_PREFIX=$PWD \ + -DEXECUTORCH_BUILD_SDK=ON \ -DEXECUTORCH_BUILD_QNN=ON \ -DQNN_SDK_ROOT=$QNN_SDK_ROOT \ -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ @@ -220,6 +224,7 @@ So, we can run `qnn_executor_runner` like ```bash adb push ./deeplab_v3/dlv3_qnn.pte ${DEVICE_DIR} adb push ${EXECUTORCH_ROOT}/build_android/examples/qualcomm/qnn_executor_runner ${DEVICE_DIR} +adb push ${EXECUTORCH_ROOT}/build_android/lib/libqnn_executorch_backend.so ${DEVICE_DIR} adb shell "cd ${DEVICE_DIR} \ && export LD_LIBRARY_PATH=${DEVICE_DIR} \ && export ADSP_LIBRARY_PATH=${DEVICE_DIR} \ diff --git a/docs/source/build-run-vulkan.md b/docs/source/build-run-vulkan.md new file mode 100644 index 00000000000..736859b86f6 --- /dev/null +++ b/docs/source/build-run-vulkan.md @@ -0,0 +1 @@ +```{include} ../../backends/vulkan/docs/android_demo.md diff --git a/docs/source/demo-apps-ios.md b/docs/source/demo-apps-ios.md index e04b6cae681..d68b1309e2b 100644 --- a/docs/source/demo-apps-ios.md +++ b/docs/source/demo-apps-ios.md @@ -1 +1 @@ -```{include} ../../examples/demo-apps/apple_ios/README.md +```{include} ../../examples/demo-apps/apple_ios/ExecuTorchDemo/README.md diff --git a/docs/source/index.rst b/docs/source/index.rst index adbda475aa2..cb78b012850 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -100,6 +100,7 @@ Topics in this section will help you get started with ExecuTorch. 
demo-apps-android examples-end-to-end-to-lower-model-to-delegate tutorial-xnnpack-delegate-lowering + build-run-vulkan .. Alphabetical by backend name. Be sure to keep the same order in the customcarditem entries below. @@ -183,6 +184,7 @@ Topics in this section will help you get started with ExecuTorch. :hidden: native-delegates-executorch-xnnpack-delegate + native-delegates-executorch-vulkan-delegate backend-delegates-integration backend-delegates-dependencies @@ -262,6 +264,13 @@ ExecuTorch tutorials. :link: tutorial-xnnpack-delegate-lowering.html :tags: Export,Backend,Delegation,Quantization,XNNPACK +.. customcarditem:: + :header: Building and Running ExecuTorch with Vulkan Backend + :card_description: A tutorial that walks you through the process of building ExecuTorch with Vulkan Backend + :image: _static/img/generic-pytorch-logo.png + :link: build-run-vulkan.html + :tags: Export,Backend,Delegation,Vulkan + .. Alphabetical by backend name. Be sure to keep the same order in the Tutorials toctree entry above. diff --git a/docs/source/llm/getting-started.md b/docs/source/llm/getting-started.md index eff8fd52ffb..3bff5c903f8 100644 --- a/docs/source/llm/getting-started.md +++ b/docs/source/llm/getting-started.md @@ -14,208 +14,443 @@ ## Prerequisites -Let’s start by getting an ExecuTorch environment: +To follow this guide, you'll need to clone the ExecuTorch repository and install dependencies. +ExecuTorch recommends Python 3.10 and the use of Conda to manage your environment. Conda is not +required, though be aware that you may need to replace the use of python/pip with python3/pip3 +depending on your environment. + +::::{tab-set} +:::{tab-item} conda +Instructions on installing miniconda can be [found here](https://docs.anaconda.com/free/miniconda). -1. Create a third-party folder (Keeps the file paths organized) -``` -mkdir third-party -cd third-party ``` -2. If you’re new to ExecuTorch follow [these steps](https://pytorch.org/executorch/main/getting-started-setup.html#set-up-your-environment) to set up your environment. +# Create a directory for this example. +mkdir et-nanogpt +cd et-nanogpt + +# Clone the ExecuTorch repository and submodules. +mkdir third-party +git clone -b release/0.2 https://github.com/pytorch/executorch.git third-party/executorch +cd third-party/executorch +git submodule update --init -## Instantiating and Executing an LLM +# Create a conda environment and install requirements. +conda create -yn executorch python=3.10.0 +conda activate executorch +pip install cmake zstd +./install_requirements.sh -We will use Karpathy’s [NanoGPT](https://github.com/karpathy/nanoGPT) but you can use another model if you prefer. +cd ../.. +``` +::: +:::{tab-item} pyenv-virtualenv +Instructions on installing pyenv-virtualenv can be [found here](https://github.com/pyenv/pyenv-virtualenv?tab=readme-ov-file#installing-with-homebrew-for-macos-users). +Importantly, if installing pyenv through brew, it does not automatically enable pyenv in the terminal, leading to errors. Run the following commands to enable. +See the pyenv-virtualenv installation guide above on how to add this to your .bashrc or .zshrc to avoid needing to run these commands manually. +``` +eval "$(pyenv init -)" +eval "$(pyenv virtualenv-init -)" +``` +``` +# Create a directory for this example. +mkdir et-nanogpt +cd et-nanogpt -There are just 2 steps to this: +pyenv install -s 3.10 +pyenv virtualenv 3.10 executorch +pyenv activate executorch -1. Export the LLM Model -2. 
Create a runtime to execute the model +# Clone the ExecuTorch repository and submodules. +mkdir third-party +git clone -b release/0.2 https://github.com/pytorch/executorch.git third-party/executorch +cd third-party/executorch +git submodule update --init +# Install requirements. +pip install cmake zstd +PYTHON_EXECUTABLE=python ./install_requirements.sh +cd ../.. +``` +::: +:::: +For more information, see [Setting Up ExecuTorch](https://pytorch.org/executorch/stable/getting-started-setup.html). -Note: Reminder to exit out of the “third-party” directory, before proceeding. -### Step 1. Export +## Running a Large Language Model Locally -[Exporting to ExecuTorch](https://pytorch.org/executorch/main/export-overview.html) simply describes taking an existing model and converting it to the ExecuTorch format. +This example uses Karpathy’s [NanoGPT](https://github.com/karpathy/nanoGPT), which is a minimal implementation of +GPT-2 124M. This guide is applicable to other language models, as ExecuTorch is model-invariant. +There are two steps to running a model with ExecuTorch: +1. Export the model. This step preprocesses it into a format suitable for runtime execution. +2. At runtime, load the model file and run with the ExecuTorch runtime. -To start, let’s retrieve our model: +
-`wget https://raw.githubusercontent.com/karpathy/nanoGPT/master/model.py` +The export step happens ahead of time, typically as part of the application build or when the model changes. The resultant +.pte file is distributed with the application. At runtime, the application loads the .pte file and passes it to the +ExecuTorch runtime. -Next, we’ll create a script (call it export.py) to generate the ExecuTorch Program (which gets dumped into an ExecuTorch Binary): +### Step 1. Exporting to ExecuTorch +Exporting takes a PyTorch model and converts it into a format that can run efficiently on consumer devices. +For this example, you will need the NanoGPT model and the corresponding tokenizer vocabulary. -1. Create the model and example inputs +::::{tab-set} +:::{tab-item} curl ``` -import torch -from model import GPT - -model = GPT.from_pretrained('gpt2') -example_inputs = (torch.randint(0, 100, (1, 8), dtype=torch.long), ) +curl https://raw.githubusercontent.com/karpathy/nanoGPT/master/model.py -O +curl https://huggingface.co/openai-community/gpt2/resolve/main/vocab.json -O ``` +::: +:::{tab-item} wget +``` +wget https://raw.githubusercontent.com/karpathy/nanoGPT/master/model.py +wget https://huggingface.co/openai-community/gpt2/resolve/main/vocab.json +``` +::: +:::: +To convert the model into a format optimized for standalone execution, there are two steps. First, use the PyTorch +`export` function to convert the PyTorch model into an intermediate, platform-independent intermediate representation. Then +use the ExecuTorch `to_edge` and `to_executorch` methods to prepare the model for on-device execution. This creates a .pte +file which can be loaded by a desktop or mobile application at runtime. +Create a file called export_nanogpt.py with the following contents: -2. Trace the model -Tracing extracts a cleaner representation of our model for conversion to ExecuTorch. -You can read more about tracing in [torch.export — PyTorch 2.2 documentation](https://pytorch.org/docs/stable/export.html). +```python +# export_nanogpt.py -``` -from torch.nn.attention import sdpa_kernel, SDPBackend +import torch + +from executorch.exir import EdgeCompileConfig, to_edge +from torch.nn.attention import sdpa_kernel, SDPBackend from torch._export import capture_pre_autograd_graph from torch.export import export -# Using a custom SDPA kernel for LLMs -with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]), torch.no_grad(): +from model import GPT -m = capture_pre_autograd_graph(model, example_inputs) +# Load the model. +model = GPT.from_pretrained('gpt2') -traced_model = export(m, example_inputs) -``` +# Create example inputs. This is used in the export process to provide +# hints on the expected shape of the model input. +example_inputs = (torch.randint(0, 100, (1, 8), dtype=torch.long), ) -3. Export the model to ExecuTorch -Exporting (or lowering) takes the model and creates a runnable ExecuTorch program, without delegate to any specific bakends for further acceleration. -``` -from executorch.exir import EdgeCompileConfig, to_edge +# Trace the model, converting it to a portable intermediate representation. +# The torch.no_grad() call tells PyTorch to exclude training-specific logic. +with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]), torch.no_grad(): + m = capture_pre_autograd_graph(model, example_inputs) + traced_model = export(m, example_inputs) + +# Convert the model into a runnable ExecuTorch program. 
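+# Note: to_edge() converts the traced program to the ExecuTorch Edge dialect, and
+# to_executorch() then prepares it for serialization to the .pte file written below.
+# _check_ir_validity=False relaxes strict IR validation for this traced graph.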
+edge_config = EdgeCompileConfig(_check_ir_validity=False) +edge_manager = to_edge(traced_model, compile_config=edge_config) +et_program = edge_manager.to_executorch() -edge_config = EdgeCompileConfig(_check_ir_validity=False) -edge_manager = to_edge(traced_model, compile_config=edge_config) -et_program = edge_manager.to_executorch() +# Save the ExecuTorch program to a file. +with open("nanogpt.pte", "wb") as file: + file.write(et_program.buffer) ``` -Also ExecuTorch provides different backend support for mobile acceleration. Simply call `to_backend()` with the specific backend partitioner on edge_manager during exportation. Take Xnnpack delegation as an example: +To export, run the script with `python export_nanogpt.py` (or python3, as appropriate for your environment). It will generate a `nanogpt.pte` file in the current directory. +For more information, see [Exporting to ExecuTorch](https://pytorch.org/executorch/main/tutorials/export-to-executorch-tutorial.html) and +[torch.export](https://pytorch.org/docs/stable/export.html). -``` -from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner -from executorch.backends.xnnpack.utils.configs import get_xnnpack_edge_compile_config -from executorch.exir import EdgeCompileConfig, to_edge +### Step 2. Invoking the Runtime -edge_config = edge_config = get_xnnpack_edge_compile_config() -edge_manager = to_edge(traced_model, compile_config=edge_config) -edge_manager = edge_manager.to_backend(XnnpackPartitioner()) +ExecuTorch provides a set of runtime APIs and types to load and run models. -et_program = edge_manager.to_executorch() -``` +Create a file called main.cpp with the following contents: -After that, we’re ready to run our model. Remember to save you model before proceeding: +```cpp +// main.cpp -``` -#Write the serialized ExecuTorch program to a file. -with open("nanogpt.pte", "wb") as file: -file.write(et_program.buffer) +#include +#include +#include +#include + +#include "basic_tokenizer.h" +#include "basic_sampler.h" +#include "managed_tensor.h" + +#include +#include +#include +#include +#include + +using namespace torch::executor; + +using SizesType = exec_aten::SizesType; +using DimOrderType = exec_aten::DimOrderType; +using StridesType = exec_aten::StridesType; ``` +The model inputs and outputs take the form of tensors. A tensor can be thought of as an multi-dimensional array. +The ExecuTorch `EValue` class provides a wrapper around tensors and other ExecuTorch data types. -Then run the script. -`python export.py` +Since the LLM generates one token at a time, the driver code needs to repeatedly invoke the model, building the +output token by token. Each generated token is passed as input for the next run. -### Step 2. Running the model -Running model stands for executing the exported model on ExecuTorch runtime platform. +```cpp +// main.cpp -Before running, we need to retrieve vocabulary file GPT2 used for tokenization: +std::string generate( + Module& llm_model, + std::string& prompt, + BasicTokenizer& tokenizer, + BasicSampler& sampler, + size_t max_output_length) { -``` -wget https://huggingface.co/openai-community/gpt2/resolve/main/vocab.json -``` -1. Create the prompt: -Prompt here means the initial cue given to the model, which it uses as a starting point to generate following sentences. Here we use “Hello world!” as example: + // Convert the input text into a list of integers (tokens) that represents + // it, using the string-to-token mapping that the model was trained on. 
+ // Each token is an integer that represents a word or part of a word. + std::vector input_tokens = tokenizer.encode(prompt); + std::vector output_tokens; + for (auto i = 0u; i < max_output_length; i++) { + // Convert the input_tokens from a vector of int64_t to EValue. + // EValue is a unified data type in the ExecuTorch runtime. + ManagedTensor tensor_tokens( + input_tokens.data(), + {1, static_cast(input_tokens.size())}, + ScalarType::Long); + std::vector inputs = {tensor_tokens.get_tensor()}; + + // Run the model. It will return a tensor of logits (log-probabilities). + Result> logits_evalue = llm_model.forward(inputs); + + // Convert the output logits from EValue to std::vector, which is what + // the sampler expects. + Tensor logits_tensor = logits_evalue.get()[0].toTensor(); + std::vector logits(logits_tensor.data_ptr(), + logits_tensor.data_ptr() + logits_tensor.numel()); -``` -string prompt = "Hello world!"; + // Sample the next token from the logits. + int64_t next_token = sampler.sample(logits); + output_tokens.push_back(next_token); + + std::cout << tokenizer.decode({ next_token }); + std::cout.flush(); + + // Update next input. + input_tokens.erase(input_tokens.begin()); + input_tokens.push_back(next_token); + } + + std::cout << std::endl; + + // Convert the output tokens into a human-readable string. + std::string output_string = tokenizer.decode(output_tokens); + return output_string; +} ``` +The `Module` class handles loading the .pte file and preparing for execution. -2. Load tokenizer and model -A Tokenizer is a crucial component among different Natural Language Processing (NLP) tasks. The primary functionalities are: +The tokenizer is responsible for converting from a human-readable string representation of the prompt to the +numerical form expected by the model. To do this, the tokenzier associates short substrings with a given token ID. +The tokens can be thought of as representing words or parts of words, though, in-practice, they may be arbitrary +sequences of characters. -- Encode: Convert text into structural and numerical representations by parsing text into smaller units.Each unit is replaced by a specific number for the NLP model to consume +The tokenizer loads the vocabulary from a file, which contains the mapping between each token ID and the text it +represents. Call `tokenizer.encode()` and `tokenizer.decode()` to convert between string and token representations. -- Decode: Convert the numerical representations back for human interpretation. +The sampler is responsible for selecting the next token, based on the logits, or log-probabilties, output by the +model. The LLM returns a logit value for each possible next token. The sampler chooses which token to use based +on some strategy. The simplest approach, used here, is to take the token with the highest logit value. +Samplers may provide configurable options, such as configurable amount of randomness to the outputs selection, +penalties for repeated tokens, and biases to prioritize or de-prioritize specific tokens. -In our NanoGPT example, we create a simple tokenizer called BasicTokenizer to demonstrate the function. You can use other implementations like [tiktoken](https://github.com/openai/tiktoken) or your own implementation to do that. +```cpp +// main.cpp -``` -#include "basic_tokenizer.h" -BasicTokenizer tokenizer("vocab.json"); -``` +int main() { + // Set up the prompt. This provides the seed text for the model to elaborate. 
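+  // Note: generate(), defined above, encodes this prompt, runs the model one
+  // step at a time, and prints each sampled token as it is decoded.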
+ std::string prompt = "Once upon a time, there was a"; + // The tokenizer is used to convert between tokens (used by the model) and + // human-readable strings. + BasicTokenizer tokenizer("vocab.json"); -To load the exported ExecuTorch model into runtime environment, we can use **Module** class: + // The sampler is used to sample the next token from the logits. + BasicSampler sampler = BasicSampler(); + // Load the exported nanoGPT program, which was generated via the previous steps. + Module model("nanogpt.pte", torch::executor::Module::MlockConfig::UseMlockIgnoreErrors); -``` -#include -Module llm_model("nanogpt.pte"); + const auto max_output_tokens = 30; + std::cout << prompt; + generate(model, prompt, tokenizer, sampler, max_output_tokens); +} ``` +Finally, download the following files into the same directory as main.h: -3. Tokenize the prompt +TODO: This is a placeholder. ``` -vector tokens = tokenizer.encode(prompt); +curl -O https://raw.githubusercontent.com/GregoryComer/et-tutorials/quantization/nanogpt/managed_tensor.h +curl -O https://raw.githubusercontent.com/GregoryComer/et-tutorials/quantization/nanogpt/basic_tokenizer.h +curl -O https://raw.githubusercontent.com/GregoryComer/et-tutorials/quantization/nanogpt/basic_sampler.h ``` -4. Generate outputs -We use the loaded model to generate text based on tokenized prompt. Here we create a helper function to illustrate the pipeline: +To learn more, see [Running an ExecuTorch Model in C++](https://pytorch.org/executorch/main/running-a-model-cpp-tutorial.html) +and the [ExecuTorch Runtime API Reference](https://pytorch.org/executorch/main/executorch-runtime-api-reference.html). -``` -vector generate(Module& llm_model, vector& input_tokens, BasicSampler& sampler, size_t target_output_length) { - vector output_tokens; - for (int i = 0; i < target_output_length; i++) { - // Convert the input_tokens from a vector of int64_t to EValue. - // Evalue is a unified data type in the executorch runtime. - ManagedTensor tensor_tokens(input_tokens.data(), {1, 8}, ScalarType::Long); - vector inputs = {tensor_tokens.get_tensor()}; - // Run the model given the Evalue inputs. The model will also return a sequence of EValues as output. - Result> logits_evalue = llm_model.forward(inputs); - // Convert the output from EValue to a logits in float. - Tensor logits_tensor = logits_evalue.get()[0].toTensor(); - vector logits(logits_tensor.data_ptr(), logits_tensor.data_ptr() + logits_tensor.numel()); - // Sample the next token from the logits. - int64_t next_token = sampler.sample(logits); - // Record the next token - output_tokens.push_back(next_token); - // Update next input. - input_tokens.erase(input_tokens.begin()); - input_tokens.push_back(next_token); - } - return output_tokens; -} +### Building and Running -``` +ExecuTorch uses the CMake build system. To compile and link against the ExecuTorch runtime, +include the ExecuTorch project via `add_directory` and link against `executorch` and additional +dependencies. +Create a file named CMakeLists.txt with the following content: -And in the main function, we leverage the function to generate the outputs. ``` -vector outputs = generate(llm_model, tokens, sampler, /*target_output_length*/20); +# CMakeLists.txt + +cmake_minimum_required(VERSION 3.19) +project(nanogpt_runner) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED True) + +# Set options for executorch build. 
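+# (The data loader and Module extensions are used by main.cpp to load and run
+# nanogpt.pte; EXECUTORCH_BUILD_OPTIMIZED provides the optimized CPU kernels
+# linked below.)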
+option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER "" ON)
+option(EXECUTORCH_BUILD_EXTENSION_MODULE "" ON)
+option(EXECUTORCH_BUILD_OPTIMIZED "" ON)
+
+# Include the executorch subdirectory.
+add_subdirectory(
+  ${CMAKE_CURRENT_SOURCE_DIR}/third-party/executorch
+  ${CMAKE_BINARY_DIR}/third-party/executorch)
+
+add_executable(nanogpt_runner main.cpp)
+target_link_libraries(
+  nanogpt_runner
+  PRIVATE
+  executorch
+  extension_module_static # Provides the Module class
+  optimized_native_cpu_ops_lib) # Provides baseline cross-platform kernels
+```
+
+At this point, the working directory should contain the following files:
+
+- CMakeLists.txt
+- main.cpp
+- basic_tokenizer.h
+- basic_sampler.h
+- managed_tensor.h
+- export_nanogpt.py
+- model.py
+- vocab.json
+- nanogpt.pte
+
+If all of these are present, you can now build and run:
+```bash
+(rm -rf cmake-out && mkdir cmake-out && cd cmake-out && cmake ..)
+cmake --build cmake-out -j10
+./cmake-out/nanogpt_runner
```
+
+You should see something like the following:
+
+```
+Once upon a time, there was a man who was a member of the military...
```
+
+At this point, it is likely to run very slowly. This is because ExecuTorch hasn't been told to optimize for
+specific hardware (delegation), and because it is doing all of the calculations in 32-bit floating point (no quantization).
+
+## Delegation
+
+While ExecuTorch provides a portable, cross-platform implementation for all
+operators, it also provides specialized backends for a number of different
+targets. These include, but are not limited to, x86 and ARM CPU acceleration via
+the XNNPACK backend, Apple acceleration via the CoreML backend and Metal
+Performance Shaders (MPS) backend, and GPU acceleration via the Vulkan backend.
+
+Because optimizations are specific to a given backend, each pte file is specific
+to the backend(s) targeted at export. To support multiple devices, such as
+XNNPACK acceleration for Android and CoreML for iOS, export a separate PTE file
+for each backend.
+
+To delegate to a backend at export time, ExecuTorch provides the `to_backend()`
+function in the `EdgeProgramManager` object, which takes a backend-specific
+partitioner object. The partitioner is responsible for finding parts of the
+computation graph that can be accelerated by the target backend, and the
+`to_backend()` function will delegate the matched parts to that backend for
+acceleration and optimization. Any portions of the computation graph not
+delegated will be executed by the ExecuTorch operator implementations.
+
+To delegate the exported model to a specific backend, first import that backend's
+partitioner and edge compile config from the ExecuTorch codebase, then
+call `to_backend` with an instance of the partitioner on the `EdgeProgramManager`
+object created by the `to_edge` function.
+ +Here's an example of how to delegate NanoGPT to XNNPACK (if you're deploying to an Android Phone for instance): + +```python +# export_nanogpt.py + +# Load partitioner for Xnnpack backend +from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner + +# Model to be delegated to specific backend should use specific edge compile config +from executorch.backends.xnnpack.utils.configs import get_xnnpack_edge_compile_config +from executorch.exir import EdgeCompileConfig, to_edge + +import torch +from torch.export import export +from torch.nn.attention import sdpa_kernel, SDPBackend +from torch._export import capture_pre_autograd_graph + +from model import GPT + +# Load the NanoGPT model. +model = GPT.from_pretrained('gpt2') + +# Create example inputs. This is used in the export process to provide +# hints on the expected shape of the model input. +example_inputs = ( + torch.randint(0, 100, (1, 8), dtype=torch.long), + ) + +# Trace the model, converting it to a portable intermediate representation. +# The torch.no_grad() call tells PyTorch to exclude training-specific logic. +with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]), torch.no_grad(): + m = capture_pre_autograd_graph(model, example_inputs) + traced_model = export(m, example_inputs) + +# Convert the model into a runnable ExecuTorch program. +# To be further lowered to Xnnpack backend, `traced_model` needs xnnpack-specific edge compile config +edge_config = get_xnnpack_edge_compile_config() +edge_manager = to_edge(traced_model, compile_config=edge_config) + +# Delegate exported model to Xnnpack backend by invoking `to_backend` function with Xnnpack partitioner. +edge_manager = edge_manager.to_backend(XnnpackPartitioner()) +et_program = edge_manager.to_executorch() + +# Save the Xnnpack-delegated ExecuTorch program to a file. +with open("nanogpt.pte", "wb") as file: + file.write(et_program.buffer) + + ``` -### Build and Run -1. Create the Cmake file for build +Additionally, update CMakeLists.txt to build and link the XNNPACK backend to +ExecuTorch runner. + ``` cmake_minimum_required(VERSION 3.19) project(nanogpt_runner) @@ -223,71 +458,95 @@ project(nanogpt_runner) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED True) - # Set options for executorch build. option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER "" ON) option(EXECUTORCH_BUILD_EXTENSION_MODULE "" ON) -option(EXECUTORCH_BUILD_XNNPACK "" ON) -option(EXECUTORCH_BUILD_SDK "" ON) # Needed for etdump +option(EXECUTORCH_BUILD_OPTIMIZED "" ON) +option(EXECUTORCH_BUILD_XNNPACK "" ON) # Build with Xnnpack backend # Include the executorch subdirectory. add_subdirectory( - ${CMAKE_CURRENT_SOURCE_DIR}/../executorch + ${CMAKE_CURRENT_SOURCE_DIR}/third-party/executorch ${CMAKE_BINARY_DIR}/executorch) # include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src) -add_executable(nanogpt_runner nanogpt_runner.cpp) +add_executable(nanogpt_runner main.cpp) target_link_libraries( nanogpt_runner PRIVATE - etdump - extension_module - portable_ops_lib) - + executorch + extension_module_static # Provides the Module class + optimized_native_cpu_ops_lib # Provides baseline cross-platform kernels + xnnpack_backend) # Provides the XNNPACK CPU acceleration backend ``` -This CMake file links the ExecuTorch codebase, along with the necessary extensions and XNNPACK modules, to the nanogpt runner. +Keep the rest of the code the same. 
For more details refer to +[Exporting to ExecuTorch](https://pytorch.org/executorch/main/llm/getting-started.html#step-1-exporting-to-executorch) +and +[Invoking the Runtime](https://pytorch.org/executorch/main/llm/getting-started.html#step-2-invoking-the-runtime) +for more details -2. Build the c++ environment for nanorunner -``` -(rm -rf cmake-out \ - && mkdir cmake-out \ - && cd cmake-out \ - && cmake ..) -``` +At this point, the working directory should contain the following files: -3. With this CMake file as well as built environment iin place, you can build the nanogpt runner binary by executing the following command: +- CMakeLists.txt +- main.cpp +- basic_tokenizer.h +- basic_sampler.h +- managed_tensor.h +- export_nanogpt.py +- model.py +- vocab.json -``` -cmake --build cmake-out --target nanogpt_runner -j9 +If all of these are present, you can now export Xnnpack delegated pte model: +```bash +python export_nanogpt.py ``` -4. After the build is complete, you can run the binary with this command: -``` +It will generate `nanogpt.pte`, under the same working directory. + +Then we can build and run the model by: +```bash +(rm -rf cmake-out && mkdir cmake-out && cd cmake-out && cmake ..) +cmake --build cmake-out -j10 ./cmake-out/nanogpt_runner ``` -If everything worked it should see something like this: -``` -prompt: Hello world! -output: Hello world! -I'm not sure if you've heard of the "Curse of the Dragon" or +You should see something like the following: + +``` +Once upon a time, there was a man who was a member of the military... ``` -## Quantization (Optional) -Quantization refers to a set of techniques for running calculations and storing tensors using lower precision types. Compared to 32-bit floating point, using 8-bit integers can provide both a significant speedup and reduction in memory usage. There are many approaches to quantizing a model, varying in amount of pre-processing required, data types used, and impact on model accuracy and performance. +For more information regarding backend delegateion, see the ExecuTorch guides +for the +[XNNPACK Backend](https://pytorch.org/executorch/stable/tutorial-xnnpack-delegate-lowering.html) +and +[CoreML Backend](https://pytorch.org/executorch/stable/build-run-coreml.html). -Because compute and memory are highly constrained on mobile devices, some form of quantization is necessary to ship large models on consumer electronics. In particular, large language models, such as Llama2, may require quantizing model weights to 4 bits or less. +## Quantization -Leveraging quantization requires transforming the model before export. PyTorch provides multiple quantization flows. Because we are quantizing a model for export, we need to use the PyTorch 2.0 export (pt2e) quantization API. +Quantization refers to a set of techniques for running calculations and storing tensors using lower precision types. +Compared to 32-bit floating point, using 8-bit integers can provide both a significant speedup and reduction in +memory usage. There are many approaches to quantizing a model, varying in amount of pre-processing required, data +types used, and impact on model accuracy and performance. -This example targets CPU acceleration using the XNNPACK delegate. As such, we need to use the XNNPACK-specific quantizer. Targeting a different backend will require use of the corresponding quantizer. +Because compute and memory are highly constrained on mobile devices, some form of quantization is necessary to ship +large models on consumer electronics. 
In particular, large language models, such as Llama2, may require quantizing +model weights to 4 bits or less. -To use 8-bit integer dynamic quantization with the XNNPACK delegate, perform the following calls prior to calling export. This will update and annotate the computational graph to use quantized operators, where available. +Leveraging quantization requires transforming the model before export. PyTorch provides the pt2e (PyTorch 2 Export) +API for this purpose. This example targets CPU acceleration using the XNNPACK delegate. As such, it needs to use the + XNNPACK-specific quantizer. Targeting a different backend will require use of the corresponding quantizer. + +To use 8-bit integer dynamic quantization with the XNNPACK delegate, call `prepare_pt2e`, calibrate the model by +running with a representative input, and then call `convert_pt2e`. This updates the computational graph to use +quantized operators where available. + +```python +# export_nanogpt.py -``` from executorch.backends.transforms.duplicate_dynamic_quant_chain import ( DuplicateDynamicQuantChainPass, ) @@ -296,7 +555,9 @@ from torch.ao.quantization.quantizer.xnnpack_quantizer import ( XNNPACKQuantizer, ) from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e +``` +```python # Use dynamic, per-channel quantization. xnnpack_quant_config = get_symmetric_quantization_config( is_per_channel=True, is_dynamic=True @@ -318,48 +579,53 @@ m = convert_pt2e(m, fold_quantize=False) DuplicateDynamicQuantChainPass()(m) traced_model = export(m, example_inputs) - ``` -Additionally, add or update the to_backend() call to use XnnpackDynamicallyQuantizedPartitioner. This will instruct the lowering logic to emit the correct quantized operators. +Additionally, add or update the `to_backend()` call to use `XnnpackPartitioner`. This instructs ExecuTorch to +optimize the model for CPU execution via the XNNPACK backend. -``` +```python from executorch.backends.xnnpack.partition.xnnpack_partitioner import ( - XnnpackDynamicallyQuantizedPartitioner, + XnnpackPartitioner, ) +``` +```python edge_manager = to_edge(traced_model, compile_config=edge_config) - -# Lower to XNNPACK using the appropriate quantized partitioner. -edge_manager = edge_manager.to_backend(XnnpackDynamicallyQuantizedPartitioner()) - +edge_manager = edge_manager.to_backend(XnnpackPartitioner()) # Lower to XNNPACK. et_program = edge_manager.to_executorch() ``` -Finally, update the CMakeLists.txt to link the XNNPACK backend with the runner. + +Finally, ensure that the runner links against the `xnnpack_backend` target in CMakeLists.txt. ``` -add_executable(nanogpt_runner nanogpt_runner.cpp) +add_executable(nanogpt_runner main.cpp) target_link_libraries( nanogpt_runner PRIVATE - etdump - extension_module - portable_ops_lib - xnnpack_backend) # Link the XNNPACK backend + executorch + extension_module_static # Provides the Module class + optimized_native_cpu_ops_lib # Provides baseline cross-platform kernels + xnnpack_backend) # Provides the XNNPACK CPU acceleration backend ``` -## Debugging and Profiling -After lowering a model by calling to_backend(), you might want to see what got delegated and what didn’t. We provide util functions to help you get insight on the delegation, and with such information, you can debug and maybe improve the delegation. +For more information, see [Quantization in ExecuTorch](https://pytorch.org/executorch/stable/quantization-overview.html). 
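+# Note: these imports, together with the quantization calls in the next block,
+# belong in export_nanogpt.py. Quantization is applied to the graph produced by
+# capture_pre_autograd_graph(), before export() and to_edge() are called.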
-### Debug the Delegation +## Profiling and Debugging +After lowering a model by calling `to_backend()`, you may want to see what got delegated and what didn’t. ExecuTorch +provides utility methods to give insight on the delegation. You can use this information to gain visibility into +the underlying computation and diagnose potential performance issues. Model authors can use this information to +structure the model in a way that is compatible with the target backend. -1. Get high level information -get_delegation_info gives you a summary of what happened to the model after the to_backend() call: +### Visualizing the Delegation -``` +The `get_delegation_info()` method provides a summary of what happened to the model after the `to_backend()` call: + +```python from executorch.exir.backend.utils import get_delegation_info from tabulate import tabulate +# ... After call to to_backend(), but before to_executorch() graph_module = edge_manager.exported_program().graph_module delegation_info = get_delegation_info(graph_module) print(delegation_info.get_summary()) @@ -367,8 +633,7 @@ df = delegation_info.get_operator_delegation_dataframe() print(tabulate(df, headers="keys", tablefmt="fancy_grid")) ``` - -Take NanoGPT lowered to XNNPACK as an example: +For NanoGPT targeting the XNNPACK backend, you might see the following: ``` Total delegated subgraphs: 86 Number of delegated nodes: 473 @@ -382,115 +647,116 @@ Number of non-delegated nodes: 430 | 1 | aten_add_tensor | 37 | 0 | | 2 | aten_addmm_default | 48 | 0 | | 3 | aten_arange_start_step | 0 | 25 | -| 4 | aten_bmm_default | 24 | 0 | -| 5 | aten_clone_default | 0 | 38 | -| 6 | aten_embedding_default | 0 | 2 | -| 7 | aten_expand_copy_default | 48 | 0 | -| 8 | aten_full_default | 0 | 12 | -| 9 | aten_full_like_default | 0 | 12 | -| 10 | aten_gelu_default | 0 | 12 | -| 11 | aten_index_tensor | 0 | 1 | -| 12 | aten_le_scalar | 0 | 12 | -| 13 | aten_logical_and_default | 0 | 12 | -| 14 | aten_logical_not_default | 0 | 12 | -| 15 | aten_mm_default | 1 | 0 | -| 16 | aten_mul_scalar | 24 | 0 | -| 17 | aten_native_layer_norm_default | 0 | 25 | -| 18 | aten_permute_copy_default | 109 | 0 | -| 19 | aten_scalar_tensor_default | 0 | 12 | -| 20 | aten_split_with_sizes_copy_default | 0 | 12 | -| 21 | aten_sub_tensor | 0 | 12 | -| 22 | aten_unsqueeze_copy_default | 0 | 24 | +| | ... | | | | 23 | aten_view_copy_default | 170 | 48 | -| 24 | aten_where_self | 0 | 12 | -| 25 | getitem | 0 | 147 | +| | ... | | | | 26 | Total | 473 | 430 | -In the table, we see that op type aten_view_copy_default appears 170 times in delegate graphs and 48 times in non-delegated graphs. +From the table, the operator `aten_view_copy_default` appears 170 times in delegate graphs and 48 times in non-delegated graphs. +To see a more detailed view, use the `print_delegated_graph()` method to display a printout of the whole graph. -| 23 | aten_view_copy_default | 170 | 48 | - -From here, we might want to know in which part of the graph it wasn’t delegated. For that, you can use the `print_delegated_graph` util function to see a printout of the whole graph with highlighted lowered graphs. - -2. Print graph module -Call this function right after you call `to_backend()` - -``` +```python from executorch.exir.backend.utils import print_delegated_graph -graph_module = self.edge_manager.exported_program().graph_module +graph_module = edge_manager.exported_program().graph_module print(print_delegated_graph(graph_module)) ``` +This may generate a large amount of output for large models. 
Consider using "Control+F" or "Command+F" to locate the operator you’re interested in +(e.g. “aten_view_copy_default”). Observe which instances are not under lowered graphs. -On the printed graph, you can do "Control+F" (or "Command+F" on a Mac) on the operator type you’re interested in (e.g. “aten_view_copy_default”) and observe which ones of them are not under “lowered graph()”s. +In the fragment of the output for NanoGPT below, observe that embedding and add operators are delegated to XNNPACK while the sub operator is not. -### Performance Analysis (Optional) +``` +%aten_unsqueeze_copy_default_22 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.unsqueeze_copy.default](args = (%aten_arange_start_step_23, -2), kwargs = {}) + %aten_unsqueeze_copy_default_23 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.unsqueeze_copy.default](args = (%aten_arange_start_step_24, -1), kwargs = {}) + %lowered_module_0 : [num_users=1] = get_attr[target=lowered_module_0] + backend_id: XnnpackBackend + lowered graph(): + %aten_embedding_default : [num_users=1] = placeholder[target=aten_embedding_default] + %aten_embedding_default_1 : [num_users=1] = placeholder[target=aten_embedding_default_1] + %aten_add_tensor : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.add.Tensor](args = (%aten_embedding_default, %aten_embedding_default_1), kwargs = {}) + return (aten_add_tensor,) + %executorch_call_delegate : [num_users=1] = call_function[target=torch.ops.higher_order.executorch_call_delegate](args = (%lowered_module_0, %aten_embedding_default, %aten_embedding_default_1), kwargs = {}) + %aten_sub_tensor : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.sub.Tensor](args = (%aten_unsqueeze_copy_default, %aten_unsqueeze_copy_default_1), kwargs = {}) +``` + +### Performance Analysis -Through the ExecuTorch SDK, users are able to profile a model and inspect its latency performance. +Through the ExecuTorch SDK, users are able to profile model execution, giving timing information for each operator in the model. #### Prerequisites ##### ETRecord generation (Optional) -ETRecord contains model graphs and metadata for linking runtime results (such as profiling) to the eager model. You will be able to view all profiling events with just ETDump (see next section), but with ETRecord, you will also be able to link each event to the types of operators being executed, module hierarchy, and stack traces of the original PyTorch source code. For more information, see [https://pytorch.org/executorch/main/sdk-etrecord.html](https://pytorch.org/executorch/main/sdk-etrecord.html) - +An ETRecord is an artifact generated at the time of export that contains model graphs and source-level metadata linking the ExecuTorch program to the original PyTorch model. You can view all profiling events without an ETRecord, though with an ETRecord, you will also be able to link each event to the types of operators being executed, module hierarchy, and stack traces of the original PyTorch source code. For more information, see [https://pytorch.org/executorch/main/sdk-etrecord.html](https://pytorch.org/executorch/main/sdk-etrecord.html) -**Steps for enablement:** -ETRecord is created during export. 
In your export script, you just called `to_edge() `and it returned edge_program_manager +In your export script, after calling `to_edge()` and `to_executorch()`, call `generate_etrecord()` with the `EdgeProgramManager` from `to_edge()` and the `ExecuTorchProgramManager` from `to_executorch()`. Make sure to copy the `EdgeProgramManager`, as the call to `to_backend()` mutates the graph in-place. ``` import copy +from executorch.sdk import generate_etrecord -# Make the deep copy right after your call to to_edge() -edge_program_manager_copy = copy.deepcopy(edge_program_manager) +# Make the deep copy immediately after to to_edge() +edge_manager_copy = copy.deepcopy(edge_manager) # ... -# Then generate ETRecord right after your call to to_executorch() -etrecord_path = "etrecord.bin" -generate_etrecord(etrecord_path, edge_program_manager_copy, et_program_manager) +# Generate ETRecord right after to_executorch() +etrecord_path = "etrecord.bin" +generate_etrecord(etrecord_path, edge_manager_copy, et_program) ``` -Run the export script, then the ETRecord should be generated under path ./etrecord.bin. -##### ETDump generation - -ETDump contains runtime results from executing an ExecuTorch model. For more information, see [https://pytorch.org/executorch/main/sdk-etdump.html](https://pytorch.org/executorch/main/sdk-etdump.html) +Run the export script and the ETRecord will be generated as `etrecord.bin`. +##### ETDump generation +An ETDump is an artifact generated at runtime containing a trace of the model execution. For more information, see [https://pytorch.org/executorch/main/sdk-etdump.html](https://pytorch.org/executorch/main/sdk-etdump.html) -**Steps for enablement:** -You need to enable ETDump generation in your nanogpt_runner.cpp. +Include the ETDump header in your code. +```cpp +// main.cpp -1. Include the ETDump header in your code. -``` -#include +#include ``` -2. Create an Instance of the ETDumpGen class and pass it into the Module constructor -``` +Create an Instance of the ETDumpGen class and pass it to the Module constructor. +```cpp std::unique_ptr etdump_gen_ = std::make_unique(); -Module llm_model("nanogpt.pte", Module::MlockConfig::UseMlock, std::move(etdump_gen_)); +Module model("nanogpt.pte", torch::executor::Module::MlockConfig::UseMlockIgnoreErrors, std::move(etdump_gen_)); ``` -3. Dump out the ETDump buffer after call to generate() -``` +After calling `generate()`, save the ETDump to a file. You can capture multiple +model runs in a single trace, if desired. +```cpp torch::executor::ETDumpGen* etdump_gen = -static_cast(llm_model.event_tracer()); + static_cast(model.event_tracer()); ET_LOG(Info, "ETDump size: %zu blocks", etdump_gen->get_num_blocks()); etdump_result result = etdump_gen->get_etdump_data(); if (result.buf != nullptr && result.size > 0) { -// On a device with a file system users can just write it out -// to the file-system. -FILE* f = fopen("etdump.etdp", "w+"); -fwrite((uint8_t*)result.buf, 1, result.size, f); -fclose(f); -free(result.buf); + // On a device with a file system, users can just write it to a file. + FILE* f = fopen("etdump.etdp", "w+"); + fwrite((uint8_t*)result.buf, 1, result.size, f); + fclose(f); + free(result.buf); } ``` -4. Compile your binary with the `ET_EVENT_TRACER_ENABLED` pre-processor flag to enable events to be traced and logged into ETDump inside the ExecuTorch runtime. 
Add these to your CMakeLists.txt +Additionally, update CMakeLists.txt to build with SDK and enable events to be traced and logged into ETDump: ``` +option(EXECUTORCH_BUILD_SDK "" ON) + +# ... + +target_link_libraries( + nanogpt_runner + PRIVATE + executorch + extension_module_static # Provides the Module class + optimized_native_cpu_ops_lib # Provides baseline cross-platform kernels + xnnpack_backend # Provides the XNNPACK CPU acceleration backend + etdump) # Provides event tracing and logging + target_compile_options(executorch PUBLIC -DET_EVENT_TRACER_ENABLED) target_compile_options(portable_ops_lib PUBLIC -DET_EVENT_TRACER_ENABLED) ``` @@ -498,45 +764,35 @@ Run the runner, you will see “etdump.etdp” generated. #### Analyze with Inspector APIs -Once you’ve collected debug artifacts ETDump (and the optional ETRecord), you can feed them into Inspector APIs in order to get performance details. +Once you’ve collected debug artifacts ETDump (and optionally an ETRecord), you can use the Inspector API to view performance information. -##### Creating an Inspector -``` +```python from executorch.sdk import Inspector -inspector = Inspector(etdump_path="etdump.etdp", etrecord="etrecord.bin") -# If you did not generate an ETRecord, then just pass in ETDump: `inspector = Inspector(etdump_path="etdump.etdp")` -``` +inspector = Inspector(etdump_path="etdump.etdp") +# If you also generated an ETRecord, then pass that in as well: `inspector = Inspector(etdump_path="etdump.etdp", etrecord="etrecord.bin")` -Using an Inspector -``` -with open("inspector_out.txt", "w") as file: +with open("inspector_out.txt", "w") as file: inspector.print_data_tabular(file) ``` -This saves the performance data in a tabular format in “inspector_out.txt”, with each row being a profiling event. Top rows: - -| | event_block_name | event_name | p10 (ms) | p50 (ms) | p90 (ms) | avg (ms) | min (ms) | max (ms) | op_types | is_delegated_op | delegate_backend_name | -|---|----------------------|------------------|-----------|---------------|--------------|-------------|-------------|--------------|-------------|---------------------------|----------| -| 0 | Default | Method::init | 60.502 | 60.502 | 60.502 | 60.502 | 60.502 | 60.502 | [] | False | | -| 1 | Default | Program::load_method | 60.5114 | 60.5114 | 60.5114 | 60.5114 | 60.5114 | 60.5114 | [] | False | | -| 2 | Execute | native_call_arange.start_out | 0.029583 | 0.029583 | 0.029583 | 0.029583 | 0.029583 | 0.029583 | [] | False | | -| 3 | Execute | native_call_embedding.out | 0.022916 | 0.022916 | 0.022916 | 0.022916 | 0.022916 | 0.022916 | [] | False | | -| 4 | Execute | native_call_embedding.out | 0.001084 | 0.001084 | 0.001084 | 0.001084 | 0.001084 | 0.001084 | [] | False | | +This prints the performance data in a tabular format in “inspector_out.txt”, with each row being a profiling event. Top rows look like this: +![](../_static/img/llm_manual_print_data_tabular.png) +View in full size -For more information about Inspector APIs and the rich functionality it provides, see [https://pytorch.org/executorch/main/sdk-inspector.html](https://pytorch.org/executorch/main/sdk-inspector.html). +To learn more about the Inspector and the rich functionality it provides, see the [Inspector API Reference](https://pytorch.org/executorch/main/sdk-inspector.html). -## How to use custom kernels -With our new custom op APIs, custom op/kernel authors can easily bring in their op/kernel into PyTorch/ExecuTorch and the process is streamlined. 
+## Custom Kernels +With the ExecuTorch custom operator APIs, custom operator and kernel authors can easily bring in their kernel into PyTorch/ExecuTorch. There are three steps to use custom kernels in ExecuTorch: -1. Prepare the kernel implementation using ExecuTorch types. -2. Compile and link the custom kernel to both AOT Python environment as well as the runner binary. +1. Write the custom kernel using ExecuTorch types. +2. Compile and link the custom kernel to both AOT Python environment as well as the runtime binary. 3. Source-to-source transformation to swap an operator with a custom op. -### Prepare custom kernel implementation +### Writing a Custom Kernel -Define your custom operator schema for both functional variant (used in AOT compilation) and out variant (used in ExecuTorch runtime). The schema needs to follow PyTorch ATen convention (see [native_functions.yaml](https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/native_functions.yaml)). For example: +Define your custom operator schema for both functional variant (used in AOT compilation) and out variant (used in ExecuTorch runtime). The schema needs to follow PyTorch ATen convention (see [native_functions.yaml](https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/native_functions.yaml)). ``` custom_linear(Tensor weight, Tensor input, Tensor(?) bias) -> Tensor @@ -544,89 +800,88 @@ custom_linear(Tensor weight, Tensor input, Tensor(?) bias) -> Tensor custom_linear.out(Tensor weight, Tensor input, Tensor(?) bias, *, Tensor(a!) out) -> Tensor(a!) ``` -Then write your custom kernel according to the schema using ExecuTorch types, along with APIs to register to ExecuTorch runtime: -``` -// custom_linear.h/custom_linear.cpp +Write your custom kernel according to the schema defined above. Use the `EXECUTORCH_LIBRARY` macro to make the kernel available to the ExecuTorch runtime. + +```cpp +// custom_linear.h / custom_linear.cpp #include Tensor& custom_linear_out(const Tensor& weight, const Tensor& input, optional bias, Tensor& out) { - -// calculation -return out; + // calculation + return out; } -// opset namespace myop +// Register as myop::custom_linear.out EXECUTORCH_LIBRARY(myop, "custom_linear.out", custom_linear_out); ``` -Now we need to write some wrapper for this op to show up in PyTorch, but don’t worry we don’t need to rewrite the kernel. Create a separate .cpp for this purpose: +To make this operator available in PyTorch, you can define a wrapper around the ExecuTorch custom kernel. Note that the ExecuTorch +implementation uses ExecuTorch tensor types, while the PyTorch wrapper uses ATen tensors. -``` +```cpp // custom_linear_pytorch.cpp + #include "custom_linear.h" #include at::Tensor custom_linear(const at::Tensor& weight, const at::Tensor& input, std::optional bias) { -// initialize out -at::Tensor out = at::empty({weight.size(1), input.size(1)}); + // initialize out + at::Tensor out = at::empty({weight.size(1), input.size(1)}); -// wrap kernel in custom_linear.cpp into ATen kernel -WRAP_TO_ATEN(custom_linear_out, 3)(weight, input, bias, out); + // wrap kernel in custom_linear.cpp into ATen kernel + WRAP_TO_ATEN(custom_linear_out, 3)(weight, input, bias, out); -return out; + return out; } -// standard API to register ops into PyTorch +// Register the operator with PyTorch. TORCH_LIBRARY(myop, m) { - -m.def("custom_linear(Tensor weight, Tensor input, Tensor(?) bias) -> Tensor", custom_linear); - -m.def("custom_linear.out(Tensor weight, Tensor input, Tensor(?) bias, *, Tensor(a!) 
out) -> Tensor(a!)", WRAP_TO_ATEN(custom_linear_out, 3)); + m.def("custom_linear(Tensor weight, Tensor input, Tensor(?) bias) -> Tensor", custom_linear); + m.def("custom_linear.out(Tensor weight, Tensor input, Tensor(?) bias, *, Tensor(a!) out) -> Tensor(a!)", WRAP_TO_ATEN(custom_linear_out, 3)); } ``` -### Compile and link the custom kernel - -Link it into ExecuTorch runtime: In our runner CMakeLists.txt we just need to add custom_linear.h/cpp into the binary target. We can build a dynamically loaded library (.so or .dylib) and link it as well. +### Compile and Link the Custom Kernel +To make it available to the ExecuTorch runtime, compile custom_linear.h/cpp into the binary target. You can also build the kernel as a dynamically loaded library (.so or .dylib) and link it as well. +To make it available to PyTorch, package custom_linear.h, custom_linear.cpp and custom_linear_pytorch.cpp into a dynamically loaded library (.so or .dylib) and load it into the python environment. +This is needed to make PyTorch aware of the custom operator at the time of export. -Link it into PyTorch runtime: We need to package custom_linear.h, custom_linear.cpp and custom_linear_pytorch.cpp into a dynamically loaded library (.so or .dylib) and load it into our python environment. One way of doing this is: - -``` +```python import torch -torch.ops.load_library("libcustom_linear.so/dylib") +torch.ops.load_library("libcustom_linear.so") ``` +Once loaded, you can use the custom operator in PyTorch code. -Once loaded we can perform the next step, of introducing the custom op into PyTorch environment. - -### Source-to-source transformation to introduce the custom op +For more information, see [PyTorch Custom Operators](https://pytorch.org/tutorials/advanced/torch_script_custom_ops.html) and +and [ExecuTorch Kernel Registration](https://pytorch.org/executorch/stable/kernel-library-custom-aten-kernel.html). -Easier way to introduce our customized linear is by rewriting the eager model. However, that may miss some occurrences of torch.nn.Linear in our example. A safer option is to walk through all the modules in the module hierarchy and perform the swapping. +### Using a Custom Operator in a Model -For example, we can do the following to swap torch.nn.Linear with our custom linear op: +The custom operator can explicitly used in the PyTorch model, or you can write a transformation to replace instances of a core operator with the custom variant. For this example, you could find +all instances of `torch.nn.Linear` and replace them with `CustomLinear`. -``` +```python def replace_linear_with_custom_linear(module): - for name, child in module.named_children(): - if isinstance(child, nn.Linear): + for name, child in module.named_children(): + if isinstance(child, nn.Linear): setattr( module, name, CustomLinear(child.in_features, child.out_features, child.bias), ) - else: - replace_linear_with_custom_linear(child) + else: + replace_linear_with_custom_linear(child) ``` -The rest of the steps will be the same as the normal flow. Now you can run this module in eager as well as export it to ExecuTorch and run on the runner. +The remaining steps are the same as the normal flow. Now you can run this module in eager mode as well as export to ExecuTorch. ## How to build Mobile Apps -You can also execute an LLM using ExecuTorch on iOS and Android - -**For iOS details see the [iOS Sample App](https://github.com/pytorch/executorch/tree/main/examples/demo-apps/apple_ios).** +You can execute an LLM using ExecuTorch on iOS and Android. 
+**For iOS see the [iLLaMA App](https://pytorch.org/executorch/main/llm/llama-demo-ios.html).** -**For Android see the [Android Instructions](https://pytorch.org/executorch/main/llm/llama-demo-android.html).** +**For Android, see the [Android Sample App](https://pytorch.org/executorch/main/llm/llama-demo-android.html).** diff --git a/docs/source/llm/llama-demo-ios.md b/docs/source/llm/llama-demo-ios.md new file mode 100644 index 00000000000..cc25a24f335 --- /dev/null +++ b/docs/source/llm/llama-demo-ios.md @@ -0,0 +1,2 @@ +```{include} ../../../examples/demo-apps/apple_ios/LLaMA/README.md +``` \ No newline at end of file diff --git a/docs/source/native-delegates-executorch-vulkan-delegate.md b/docs/source/native-delegates-executorch-vulkan-delegate.md new file mode 100644 index 00000000000..2c83c7f899c --- /dev/null +++ b/docs/source/native-delegates-executorch-vulkan-delegate.md @@ -0,0 +1 @@ +```{include} ../../backends/vulkan/README.md diff --git a/examples/README.md b/examples/README.md index bce3e08b58f..6865a5c35ac 100644 --- a/examples/README.md +++ b/examples/README.md @@ -30,6 +30,9 @@ examples A user's journey may commence by exploring the demos located in the [`portable/`](./portable) directory. Here, you will gain insights into the fundamental end-to-end workflow to generate a binary file from a ML model in [portable mode](../docs/source/concepts.md##portable-mode-lean-mode) and run it on the ExecuTorch runtime. +## Demo of Llama2 + +[This page](./models/llama2/README.md) demonstrates how to run a Llama 2 7B model on mobile via ExecuTorch. We use XNNPACK to accelerate the performance and 4-bit groupwise PTQ quantization to fit the model on Android and iOS mobile phones. ## Demo of Selective Build @@ -37,7 +40,7 @@ To understand how to deploy the ExecuTorch runtime with optimization for binary ## Demo of ExecuTorch SDK -You will find demos of [ExecuTorch SDK](./sdk/) in the [`sdk/`](./sdk/) directory. The examples focuses on exporting and executing BundledProgram for ExecuTorch model verification, and ETDump generation. +You will find demos of [ExecuTorch SDK](./sdk/) in the [`sdk/`](./sdk/) directory. The examples focuses on exporting and executing BundledProgram for ExecuTorch model verification and ETDump for collecting profiling and debug data. ## Demo Apps @@ -63,11 +66,6 @@ You will find demos of [ExecuTorch QNN Backend](./qualcomm) in the [`qualcomm/`] The [`xtensa/`](./xtensa) directory hosts a demo that showcases the process of exporting and executing a model on Xtensa Hifi4 DSP. You can utilize [this tutorial](../docs/source/build-run-xtensa.md) to guide you in configuring the demo and running it. - -## Demo of ExecuTorch SDK - -You will find demos of [ExecuTorch SDK](./sdk/) in the [`sdk/`](./sdk/) directory. The examples focuses on exporting and executing BundledProgram for ExecuTorch model verification and ETDump for collecting profiling and debug data. - ## Dependencies Various models and workflows listed in this directory have dependencies on some other packages. You need to follow the setup guide in [Setting up ExecuTorch from GitHub](https://pytorch.org/executorch/stable/getting-started-setup) to have appropriate packages installed. 
diff --git a/examples/apple/coreml/executor_runner/coreml_executor_runner.xcodeproj/project.pbxproj b/examples/apple/coreml/executor_runner/coreml_executor_runner.xcodeproj/project.pbxproj index 66c0b182cd5..9f52b0e1e07 100644 --- a/examples/apple/coreml/executor_runner/coreml_executor_runner.xcodeproj/project.pbxproj +++ b/examples/apple/coreml/executor_runner/coreml_executor_runner.xcodeproj/project.pbxproj @@ -16,6 +16,7 @@ C94D51662ACFCBCB00AF47FD /* Accelerate.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = C94D51652ACFCBCB00AF47FD /* Accelerate.framework */; }; C94D51682ACFCC7100AF47FD /* libcoremldelegate.a in Frameworks */ = {isa = PBXBuildFile; fileRef = C94D51672ACFCC7100AF47FD /* libcoremldelegate.a */; }; C988D69D2B998CDE00979CF6 /* libprotobuf-lite.a in Frameworks */ = {isa = PBXBuildFile; fileRef = C988D69C2B998CD700979CF6 /* libprotobuf-lite.a */; }; + F24817E72BC65B2000E80D98 /* libexecutorch_no_prim_ops.a in Frameworks */ = {isa = PBXBuildFile; fileRef = F24817E62BC65B2000E80D98 /* libexecutorch_no_prim_ops.a */; }; /* End PBXBuildFile section */ /* Begin PBXCopyFilesBuildPhase section */ @@ -41,6 +42,7 @@ C94D51652ACFCBCB00AF47FD /* Accelerate.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Accelerate.framework; path = System/Library/Frameworks/Accelerate.framework; sourceTree = SDKROOT; }; C94D51672ACFCC7100AF47FD /* libcoremldelegate.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = libcoremldelegate.a; path = libraries/libcoremldelegate.a; sourceTree = ""; }; C988D69C2B998CD700979CF6 /* libprotobuf-lite.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = "libprotobuf-lite.a"; path = "libraries/libprotobuf-lite.a"; sourceTree = ""; }; + F24817E62BC65B2000E80D98 /* libexecutorch_no_prim_ops.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = libexecutorch_no_prim_ops.a; path = libraries/libexecutorch_no_prim_ops.a; sourceTree = ""; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ @@ -49,6 +51,7 @@ buildActionMask = 2147483647; files = ( 38626BB52B225A890059413D /* libetdump.a in Frameworks */, + F24817E72BC65B2000E80D98 /* libexecutorch_no_prim_ops.a in Frameworks */, 38626BB42B225A560059413D /* libflatccrt.a in Frameworks */, C94D51682ACFCC7100AF47FD /* libcoremldelegate.a in Frameworks */, C94D51662ACFCBCB00AF47FD /* Accelerate.framework in Frameworks */, @@ -90,6 +93,7 @@ C94D515C2ACFCBA000AF47FD /* libexecutorch.a */, C94D51612ACFCBBA00AF47FD /* libsqlite3.tbd */, C94D51672ACFCC7100AF47FD /* libcoremldelegate.a */, + F24817E62BC65B2000E80D98 /* libexecutorch_no_prim_ops.a */, ); name = Frameworks; sourceTree = ""; diff --git a/examples/apple/coreml/scripts/build_executor_runner.sh b/examples/apple/coreml/scripts/build_executor_runner.sh index ad63d2a942c..d47bdf8b0ff 100755 --- a/examples/apple/coreml/scripts/build_executor_runner.sh +++ b/examples/apple/coreml/scripts/build_executor_runner.sh @@ -61,6 +61,7 @@ cp -rf "$COREML_DIR_PATH/runtime/include/" "$INCLUDE_DIR_PATH" echo "ExecuTorch: Copying libraries" mkdir "$LIBRARIES_DIR_PATH" find "$CMAKE_BUILD_DIR_PATH/" -name 'libexecutorch.a' -exec cp -f "{}" "$LIBRARIES_DIR_PATH" \; +find "$CMAKE_BUILD_DIR_PATH/" -name 'libexecutorch_no_prim_ops.a' -exec cp -f "{}" "$LIBRARIES_DIR_PATH" \; find "$CMAKE_BUILD_DIR_PATH/" -name 'libetdump.a' -exec cp -f "{}" "$LIBRARIES_DIR_PATH" \; find "$CMAKE_BUILD_DIR_PATH/" -name 'libcoremldelegate.a' -exec cp -f "{}" "$LIBRARIES_DIR_PATH" \; 
find "$CMAKE_BUILD_DIR_PATH/" -name 'libprotobuf-lite.a' -exec cp -f "{}" "$LIBRARIES_DIR_PATH" \; diff --git a/examples/arm/executor_runner/CMakeLists.txt b/examples/arm/executor_runner/CMakeLists.txt index c738a9502bf..6836f8a79ca 100644 --- a/examples/arm/executor_runner/CMakeLists.txt +++ b/examples/arm/executor_runner/CMakeLists.txt @@ -43,6 +43,11 @@ add_library(executorch STATIC IMPORTED) set_property(TARGET executorch PROPERTY IMPORTED_LOCATION "${ET_BUILD_DIR_PATH}/libexecutorch.a") +add_library(executorch_no_prim_ops STATIC IMPORTED) +set_property(TARGET executorch_no_prim_ops PROPERTY IMPORTED_LOCATION + "${ET_BUILD_DIR_PATH}/libexecutorch_no_prim_ops.a") +target_link_libraries(executorch INTERFACE executorch_no_prim_ops) + add_library(executorch_delegate_ethos_u STATIC IMPORTED) set_property(TARGET executorch_delegate_ethos_u PROPERTY IMPORTED_LOCATION "${ET_BUILD_DIR_PATH}/backends/arm/libexecutorch_delegate_ethos_u.a") diff --git a/examples/demo-apps/android/ExecuTorchDemo/README.md b/examples/demo-apps/android/ExecuTorchDemo/README.md index 990dcfadc53..27e4d14876a 100644 --- a/examples/demo-apps/android/ExecuTorchDemo/README.md +++ b/examples/demo-apps/android/ExecuTorchDemo/README.md @@ -17,7 +17,7 @@ This guide explains how to setup ExecuTorch for Android using a demo app. The ap * Refer to [Setting up ExecuTorch](https://pytorch.org/executorch/stable/getting-started-setup) to set up the repo and dev environment. * Download and install [Android Studio and SDK](https://developer.android.com/studio). * Supported Host OS: CentOS, macOS Ventura (M1/x86_64). See below for Qualcomm HTP specific requirements. -* *Qualcomm HTP Only[^1]:* To build and run on Qualcomm's AI Engine Direct, please follow [Building and Running ExecuTorch with Qualcomm AI Engine Direct Backend](build-run-qualcomm-ai-engine-direct-backend.md) for hardware and software pre-requisites. +* *Qualcomm HTP Only[^1]:* To build and run on Qualcomm's AI Engine Direct, please follow [Building and Running ExecuTorch with Qualcomm AI Engine Direct Backend](build-run-qualcomm-ai-engine-direct-backend.md) for hardware and software pre-requisites. The version we use for this tutorial is 2.19. The chip we use for this tutorial is SM8450. ::: :::: @@ -39,7 +39,7 @@ We generate the model file for the ExecuTorch runtime in Android Demo App. For delegating DeepLab v3 to XNNPACK backend, please do the following to export the model: ```bash -export FLATC_EXECUTABLE=$(realpath third-party/flatbuffers/cmake-out/flatc) +export FLATC_EXECUTABLE=$(realpath third-party/flatbuffers/cmake-android-out/flatc) python3 -m examples.xnnpack.aot_compiler --model_name="dl3" --delegate mkdir -p examples/demo-apps/android/ExecuTorchDemo/app/src/main/assets/ cp dl3_xnnpack_fp32.pte examples/demo-apps/android/ExecuTorchDemo/app/src/main/assets/ @@ -54,7 +54,7 @@ For delegating to Qualcomm Hexagon NPU, please follow the tutorial [here](build- After generating the model, copy the model to `assets` directory. ```bash -python -m examples.qualcomm.scripts.deeplab_v3 -b build_android -m SM8550 -s +python -m examples.qualcomm.scripts.deeplab_v3 -b build_android -m SM8450 -s cp deeplab_v3/dlv3_qnn.pte examples/demo-apps/android/ExecuTorchDemo/app/src/main/assets/ ``` @@ -68,22 +68,20 @@ We build the required ExecuTorch runtime library to run the model. 
```bash export ANDROID_NDK= -export BUCK2=/tmp/buck2 # Or your buck path +export ANDROID_ABI=arm64-v8a -rm -rf cmake-out && mkdir cmake-out && cd cmake-out +rm -rf cmake-android-out && mkdir cmake-android-out # Build the core executorch library -cmake .. -DCMAKE_INSTALL_PREFIX=cmake-out \ +cmake . -DCMAKE_INSTALL_PREFIX=cmake-android-out \ -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK}/build/cmake/android.toolchain.cmake" \ -DANDROID_ABI="${ANDROID_ABI}" \ - -DBUCK2="${BUCK2}" \ -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DEXECUTORCH_BUILD_FLATC=OFF \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ - -DFLATC_EXECUTABLE="${FLATC}" \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -Bcmake-android-out -cmake --build . -j16 --target install +cmake --build cmake-android-out -j16 --target install ``` When we set `EXECUTORCH_BUILD_XNNPACK=ON`, we will build the target [`xnnpack_backend`](https://github.com/pytorch/executorch/blob/main/backends/xnnpack/CMakeLists.txt) which in turn is linked into libexecutorch_jni via [CMake](https://github.com/pytorch/executorch/blob/main/examples/demo-apps/android/jni/CMakeLists.txt). @@ -93,45 +91,53 @@ When we set `EXECUTORCH_BUILD_XNNPACK=ON`, we will build the target [`xnnpack_ba ```bash # Build the android extension -cmake ../extension/android -DBUCK2="${BUCK2}" \ - -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \ +cmake extension/android \ + -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK}"/build/cmake/android.toolchain.cmake \ -DANDROID_ABI="${ANDROID_ABI}" \ - -DCMAKE_INSTALL_PREFIX=cmake-out \ - -Bextension/android + -DCMAKE_INSTALL_PREFIX=cmake-android-out \ + -Bcmake-android-out/extension/android -cmake --build ./extension/android -j16 +cmake --build cmake-android-out/extension/android -j16 ``` `libexecutorch_jni.so` wraps up the required XNNPACK Backend runtime library from `xnnpack_backend`, and adds an additional JNI layer using fbjni. This is later exposed to Java app. #### Qualcomm Hexagon NPU -1. Configure the CMake target for the library with Qualcomm Hexagon NPU (HTP) backend (XNNPACK also included): +1. Build the CMake target for the library with Qualcomm Hexagon NPU (HTP) backend (XNNPACK also included): ```bash export ANDROID_NDK= -export QNN_SDK= - -rm -rf cmake-out && mkdir cmake-out && cd cmake-out -cmake .. \ - -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ - -DANDROID_ABI=arm64-v8a \ - -DBUCK2=/tmp/buck2 \ - -DEXECUTORCH_BUILD_ANDROID_JNI=ON \ +export ANDROID_ABI=arm64-v8a +export QNN_SDK_ROOT= + +rm -rf cmake-android-out && mkdir cmake-android-out && cd cmake-android-out +cmake . -DCMAKE_INSTALL_PREFIX=cmake-android-out \ + -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK}/build/cmake/android.toolchain.cmake" \ + -DANDROID_ABI="${ANDROID_ABI}" \ -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DEXECUTORCH_BUILD_FLATC=OFF \ -DEXECUTORCH_BUILD_QNN=ON \ - -DQNN_SDK_ROOT=$QNN_SDK \ - -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON + -DQNN_SDK_ROOT="${QNN_SDK_ROOT}" \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -Bcmake-android-out + +cmake --build cmake-android-out -j16 --target install ``` Similar to the XNNPACK library, with this setup, we compile `libexecutorch_jni.so` but it adds an additional static library `qnn_executorch_backend` which wraps up Qualcomm HTP runtime library and registers the Qualcomm HTP backend. This is later exposed to Java app. `qnn_executorch_backend` is built when we turn on CMake option `EXECUTORCH_BUILD_QNN`. 
It will include the [CMakeLists.txt](https://github.com/pytorch/executorch/blob/main/backends/qualcomm/CMakeLists.txt) from backends/qualcomm where we `add_library(qnn_executorch_backend STATIC)`. -2. Build the libraries: +2. Build the Android extension: ```bash -cmake --build . -j16 +cmake extension/android \ + -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK}"/build/cmake/android.toolchain.cmake \ + -DANDROID_ABI="${ANDROID_ABI}" \ + -DCMAKE_INSTALL_PREFIX=cmake-android-out \ + -Bcmake-android-out/extension/android + +cmake --build cmake-android-out/extension/android -j16 ``` ## Deploying on Device via Demo App @@ -139,14 +145,9 @@ cmake --build . -j16 ### Steps for Deploying Model via XNNPACK ```bash -mkdir -p ../examples/demo-apps/android/ExecuTorchDemo/app/src/main/jniLibs/arm64-v8a -``` - -Copy the core libraries: - -```bash -cp ./examples/demo-apps/android/jni/libexecutorch_jni.so \ - ../examples/demo-apps/android/ExecuTorchDemo/app/src/main/jniLibs/arm64-v8a/libexecutorch.so +mkdir -p examples/demo-apps/android/ExecuTorchDemo/app/src/main/jniLibs/arm64-v8a +cp cmake-android-out/extension/android/libexecutorch_jni.so \ + examples/demo-apps/android/ExecuTorchDemo/app/src/main/jniLibs/arm64-v8a/libexecutorch.so ``` This allows the Android app to load ExecuTorch runtime with XNNPACK backend as a JNI library. Later, this shared library will be loaded by `NativePeer.java` in Java code. @@ -160,15 +161,17 @@ mkdir -p ../examples/demo-apps/android/ExecuTorchDemo/app/src/main/jniLibs/arm64 We need to push some additional Qualcomm HTP backend libraries to the app. Please refer to [Qualcomm docs](build-run-qualcomm-ai-engine-direct-backend.md) here. ```bash -cp ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtp.so ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV69Skel.so ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpStub.so ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnSystem.so \ - ../examples/demo-apps/android/ExecuTorchDemo/app/src/main/jniLibs/arm64-v8a +cp ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtp.so ${QNN_SDK_ROOT}/lib/hexagon-v69/unsigned/libQnnHtpV69Skel.so ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV69Stub.so ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnSystem.so \ + examples/demo-apps/android/ExecuTorchDemo/app/src/main/jniLibs/arm64-v8a ``` Copy the core libraries: ```bash -cp ./examples/demo-apps/android/jni/libexecutorch_jni.so \ - ../examples/demo-apps/android/ExecuTorchDemo/app/src/main/jniLibs/arm64-v8a/libexecutorch.so +cp cmake-android-out/extension/android/libexecutorch_jni.so \ + examples/demo-apps/android/ExecuTorchDemo/app/src/main/jniLibs/arm64-v8a/libexecutorch.so +cp cmake-android-out/lib/libqnn_executorch_backend.so \ + examples/demo-apps/android/ExecuTorchDemo/app/src/main/jniLibs/arm64-v8a/libqnn_executorch_backend.so ``` ## Running the App diff --git a/examples/demo-apps/android/ExecuTorchDemo/app/build.gradle.kts b/examples/demo-apps/android/ExecuTorchDemo/app/build.gradle.kts index 4407fbc3fe6..615fee860f8 100644 --- a/examples/demo-apps/android/ExecuTorchDemo/app/build.gradle.kts +++ b/examples/demo-apps/android/ExecuTorchDemo/app/build.gradle.kts @@ -68,3 +68,12 @@ dependencies { debugImplementation("androidx.compose.ui:ui-tooling") debugImplementation("androidx.compose.ui:ui-test-manifest") } + +tasks.register("setup") { + doFirst { + exec { + commandLine("sh", "examples/demo-apps/android/LlamaDemo/setup.sh") + workingDir("../../../../../") + } + } +} diff --git a/examples/demo-apps/android/ExecuTorchDemo/setup.sh b/examples/demo-apps/android/ExecuTorchDemo/setup.sh index 
66be7da3157..8ff65bee59b 100644 --- a/examples/demo-apps/android/ExecuTorchDemo/setup.sh +++ b/examples/demo-apps/android/ExecuTorchDemo/setup.sh @@ -1,40 +1,40 @@ #!/usr/bin/env bash +# Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. # -# Copyright 2023-2024 Arm Limited and/or its affiliates. -# # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. set -eu -# Note: Set up ANDROID_NDK, ANDROID_ABI, BUCK2, and FLATC -cmake . -DCMAKE_INSTALL_PREFIX=cmake-out \ +CMAKE_OUT="${CMAKE_OUT:-cmake-out-android}" +# Note: Set up ANDROID_NDK and ANDROID_ABI +cmake . -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK}/build/cmake/android.toolchain.cmake" \ -DANDROID_ABI="${ANDROID_ABI}" \ - -DBUCK2="${BUCK2}" \ -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DEXECUTORCH_BUILD_FLATC=OFF \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ - -DFLATC_EXECUTABLE="${FLATC}" \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ - -Bcmake-out + -DEXECUTORCH_BUILD_OPTIMIZED=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -B"${CMAKE_OUT}" if [ "$(uname)" == "Darwin" ]; then CMAKE_JOBS=$(( $(sysctl -n hw.ncpu) - 1 )) else CMAKE_JOBS=$(( $(nproc) - 1 )) fi -cmake --build cmake-out -j "${CMAKE_JOBS}" --target install +cmake --build "${CMAKE_OUT}" -j "${CMAKE_JOBS}" --target install --config Release -cmake extension/android -DBUCK2="${BUCK2}" \ +cmake extension/android \ -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \ -DANDROID_ABI="${ANDROID_ABI}" \ - -DCMAKE_INSTALL_PREFIX=cmake-out \ - -Bcmake-out/extension/android + -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ + -DCMAKE_BUILD_TYPE=Release \ + -B"${CMAKE_OUT}"/extension/android -cmake --build cmake-out/extension/android -j "${CMAKE_JOBS}" +cmake --build "${CMAKE_OUT}"/extension/android -j "${CMAKE_JOBS}" --config Release JNI_LIBS_PATH="examples/demo-apps/android/ExecuTorchDemo/app/src/main/jniLibs" mkdir -p "${JNI_LIBS_PATH}/${ANDROID_ABI}" -cp cmake-out/extension/android/libexecutorch_jni.so "${JNI_LIBS_PATH}/${ANDROID_ABI}/libexecutorch.so" +cp "${CMAKE_OUT}"/extension/android/libexecutorch_jni.so "${JNI_LIBS_PATH}/${ANDROID_ABI}/" diff --git a/examples/demo-apps/android/LlamaDemo/README.md b/examples/demo-apps/android/LlamaDemo/README.md index fccc4288f53..0c70ec1620a 100644 --- a/examples/demo-apps/android/LlamaDemo/README.md +++ b/examples/demo-apps/android/LlamaDemo/README.md @@ -13,9 +13,7 @@ This app demonstrates the use of the LLaMA chat app demonstrating local inferenc * Alternatively, you can follow [this guide](https://github.com/pytorch/executorch/blob/856e085b9344c8b0bf220a97976140a5b76356aa/examples/demo-apps/android/LlamaDemo/SDK.md) to set up Java/SDK/NDK with CLI. * Supported Host OS: CentOS, macOS Sonoma on Apple Silicon. -```{note} -This demo app and tutorial has only been validated with arm64-v8a [ABI](https://developer.android.com/ndk/guides/abis), with NDK 25.0.8775105. -``` +Note: This demo app and tutorial has only been validated with arm64-v8a [ABI](https://developer.android.com/ndk/guides/abis), with NDK 25.0.8775105. ## Getting models Please refer to the [ExecuTorch Llama2 docs](https://github.com/pytorch/executorch/blob/main/examples/models/llama2/README.md) to export the model. @@ -27,22 +25,19 @@ adb push llama2.pte /data/local/tmp/llama adb push tokenizer.bin /data/local/tmp/llama ``` -```{note} -The demo app searches in `/data/local/tmp/llama` for .pte and .bin files as LLAMA model and tokenizer. 
-``` +Note: The demo app searches in `/data/local/tmp/llama` for .pte and .bin files as LLAMA model and tokenizer. ## Build JNI library 1. Open a terminal window and navigate to the root directory of the `executorch`. 2. Set the following environment variables: -```{note} - is the root for the NDK, which is usually under -~/Library/Android/sdk/ndk/XX.Y.ZZZZZ for macOS, and contains NOTICE and README.md. -We use /build/cmake/android.toolchain.cmake for CMake to cross-compile. -``` ```bash export ANDROID_NDK= export ANDROID_ABI=arm64-v8a ``` +Note: `` is the root for the NDK, which is usually under +`~/Library/Android/sdk/ndk/XX.Y.ZZZZZ` for macOS, and contains NOTICE and README.md. +We use `/build/cmake/android.toolchain.cmake` for CMake to cross-compile. + 3. Run the following command set up the required JNI library: ```bash pushd examples/demo-apps/android/LlamaDemo diff --git a/examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo.xcodeproj/project.pbxproj b/examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo.xcodeproj/project.pbxproj index ce20a78e8e4..fd54315b721 100644 --- a/examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo.xcodeproj/project.pbxproj +++ b/examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo.xcodeproj/project.pbxproj @@ -784,7 +784,7 @@ isa = XCRemoteSwiftPackageReference; repositoryURL = "https://github.com/pytorch/executorch"; requirement = { - branch = main; + branch = release/0.2; kind = branch; }; }; diff --git a/examples/demo-apps/apple_ios/README.md b/examples/demo-apps/apple_ios/ExecuTorchDemo/README.md similarity index 100% rename from examples/demo-apps/apple_ios/README.md rename to examples/demo-apps/apple_ios/ExecuTorchDemo/README.md diff --git a/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj b/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj index 80ab3c34b0d..5ee7fc57247 100644 --- a/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj +++ b/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj @@ -796,7 +796,7 @@ isa = XCRemoteSwiftPackageReference; repositoryURL = "https://github.com/pytorch/executorch"; requirement = { - branch = main; + branch = release/0.2; kind = branch; }; }; diff --git a/examples/demo-apps/apple_ios/LLaMA/LLaMA/Application/ContentView.swift b/examples/demo-apps/apple_ios/LLaMA/LLaMA/Application/ContentView.swift index 5d7ddbc388f..e3db6125c49 100644 --- a/examples/demo-apps/apple_ios/LLaMA/LLaMA/Application/ContentView.swift +++ b/examples/demo-apps/apple_ios/LLaMA/LLaMA/Application/ContentView.swift @@ -215,7 +215,7 @@ struct ContentView: View { tokens.append(token) if tokens.count > 2 { let text = tokens.joined() - let count = text.count + let count = tokens.count tokens = [] DispatchQueue.main.async { withAnimation { diff --git a/examples/demo-apps/apple_ios/LLaMA/README.md b/examples/demo-apps/apple_ios/LLaMA/README.md new file mode 100644 index 00000000000..04a6eaef671 --- /dev/null +++ b/examples/demo-apps/apple_ios/LLaMA/README.md @@ -0,0 +1,37 @@ +# Building ExecuTorch LLaMA iOS Demo App + +This app demonstrates the use of the LLaMA chat app demonstrating local inference use case with ExecuTorch. + +iOS LLaMA App
+ +## Prerequisites +* [Xcode 15](https://developer.apple.com/xcode). +* [iOS 17 SDK](https://developer.apple.com/ios). +* Set up your ExecuTorch repo and dev environment if you haven’t already by following the [Setting up ExecuTorch](https://pytorch.org/executorch/stable/getting-started-setup) guide. + +## Exporting models +Please refer to the [ExecuTorch Llama2 docs](https://github.com/pytorch/executorch/blob/main/examples/models/llama2/README.md) to export the model. + +## Run the App + +1. Open the [project](https://github.com/pytorch/executorch/blob/main/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj) in Xcode. +2. Run the app (cmd+R). +3. In the app UI, pick a model and tokenizer to use, type a prompt, and tap the arrow button as shown in the [video](../_static/img/llama_ios_app.mp4). + +```{note} +The ExecuTorch runtime is distributed as a Swift package that provides prebuilt .xcframework binary targets. Xcode will download and cache the package on the first run, which will take some time. +``` + +## Copy the model to Simulator + +1. Drag and drop the model and tokenizer files onto the Simulator window and save them somewhere inside the iLLaMA folder. +2. Pick the files in the app dialog, type a prompt and click the arrow-up button. + +## Copy the model to Device + +1. Wire-connect the device and open the contents in Finder. +2. Navigate to the Files tab and drag and drop the model and tokenizer files onto the iLLaMA folder. +3. Wait until the files are copied. + +## Reporting Issues +If you encounter any bugs or issues while following this tutorial, please file an issue on [GitHub](https://github.com/pytorch/executorch/issues/new). diff --git a/examples/models/llama2/README.md b/examples/models/llama2/README.md index d392673d34a..07a0534ff2c 100644 --- a/examples/models/llama2/README.md +++ b/examples/models/llama2/README.md @@ -32,15 +32,19 @@ Note that groupsize less than 128 was not enabled, since such model were still t ## Performance -Performance was measured on Samsung Galaxy S22, S23, S24 and One Plus 12. Measurement performance is in terms of tokens/second. +Performance was measured on Samsung Galaxy S22, S24, One Plus 12 and iPhone 15 Pro Max. Performance is measured in tokens/second. |Device | Groupwise 4-bit (128) | Groupwise 4-bit (256) |--------| ---------------------- | --------------- -|Galaxy S22 | 8.15 tokens/second | 8.3 tokens/second | -|Galaxy S24 | 10.66 tokens/second | 11.26 tokens/second | -|One plus 12 | 11.55 tokens/second | 11.6 tokens/second | -|iPhone 15 pro | x | x | +|Galaxy S22* | 8.15 tokens/second | 8.3 tokens/second | +|Galaxy S24* | 10.66 tokens/second | 11.26 tokens/second | +|One plus 12* | 11.55 tokens/second | 11.6 tokens/second | +|Galaxy S22** | 5.5 tokens/second | 5.9 tokens/second | +|iPhone 15 pro** | ~6 tokens/second | ~6 tokens/second | +*: Measured via an adb binary-based [workflow](#step-5-run-benchmark-on) + +**: Measured via an app-based [workflow](#step-6-build-mobile-apps) # Instructions @@ -61,10 +65,17 @@ You can export and run the original Llama2 7B model. 1. Llama2 pretrained parameters can be downloaded from [Meta's official website](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) or from [Hugging Face](https://huggingface.co/meta-llama/Llama-2-7b). -2. Export model and generate `.pte` file: +2. Edit the `params.json` file: replace `"vocab_size": -1` with `"vocab_size": 32000`. This is a short-term workaround (a scripted version of this edit is sketched further below). + +3. 
Export model and generate `.pte` file: ``` python -m examples.models.llama2.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 ``` +4. Create `tokenizer.bin`: + + ``` + python -m examples.models.llama2.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin + ``` ### Option B: Download and export stories110M model @@ -208,20 +219,23 @@ cmake --build cmake-out-android/examples/models/llama2 -j16 --config Release **2.2 Upload model, tokenizer and llama runner binary to phone** ``` -adb push /data/local/tmp/ -adb push /data/local/tmp/ -adb push cmake-out-android/examples/models/llama2/llama_main /data/local/tmp/ +adb shell mkdir -p /data/local/tmp/llama +adb push /data/local/tmp/llama/ +adb push /data/local/tmp/llama/ +adb push cmake-out-android/examples/models/llama2/llama_main /data/local/tmp/llama/ ``` **2.3 Run model** ``` -adb shell "cd /data/local/tmp && ./llama_main --model_path --tokenizer_path --prompt "Once upon a time" --seq_len 120 +adb shell "cd /data/local/tmp/llama && ./llama_main --model_path --tokenizer_path --prompt "Once upon a time" --seq_len 120 ``` -## Step 6: Build iOS and/or Android apps +## Step 6: Build Mobile apps + +### iOS -TODO +Please refer to [this tutorial](https://pytorch.org/executorch/main/llm/llama-demo-ios.html) for full instructions on building the iOS LLAMA Demo App. -### Android app +### Android Please refer to [this tutorial](https://pytorch.org/executorch/main/llm/llama-demo-android.html) to for full instructions on building the Android LLAMA Demo App. # What is coming next? @@ -238,7 +252,6 @@ Please refer to [this tutorial](https://pytorch.org/executorch/main/llm/llama-de - Enabling LLama2 7b and other architectures via Vulkan - Enabling performant execution of widely used quantization schemes. 
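Step 2 of the Llama2 export instructions above asks you to hand-edit `params.json` so that `vocab_size` has a real value instead of the placeholder `-1`. A minimal sketch of doing that edit with a short script; the location of `params.json` (current directory) is an assumption, while the value 32000 comes from the step itself:

```python
import json
from pathlib import Path

# Hypothetical location of the downloaded Llama2 params file; adjust as needed.
params_path = Path("params.json")

params = json.loads(params_path.read_text())

# The downloaded params ship with a placeholder vocab_size of -1; replace it
# with the tokenizer's actual vocabulary size (32000, per the step above).
if params.get("vocab_size", -1) == -1:
    params["vocab_size"] = 32000

params_path.write_text(json.dumps(params, indent=2) + "\n")
```

The script only rewrites the file when the placeholder is present, so rerunning it leaves an already-patched `params.json` unchanged.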
-TODO # Notes This example tries to reuse the Python code, with minimal modifications to make it compatible with current ExecuTorch: diff --git a/examples/models/llama2/TARGETS b/examples/models/llama2/TARGETS index c93ea6149ff..b132962963a 100644 --- a/examples/models/llama2/TARGETS +++ b/examples/models/llama2/TARGETS @@ -18,7 +18,6 @@ runtime.python_library( ], deps = [ "//caffe2:torch", - "//executorch/examples/models/llama2/custom_ops:llama_custom_ops_aot_lib", ], ) @@ -85,6 +84,7 @@ runtime.python_library( "//executorch/backends/vulkan/partitioner:vulkan_partitioner", "//executorch/examples/models:model_base", "//executorch/examples/models:models", + "//executorch/examples/models/llama2/custom_ops:custom_ops_aot_py", "//executorch/examples/portable:utils", "//executorch/exir:lib", "//executorch/sdk/etrecord:etrecord", diff --git a/examples/models/llama2/builder.py b/examples/models/llama2/builder.py index 3473391b641..cb1a82e9618 100644 --- a/examples/models/llama2/builder.py +++ b/examples/models/llama2/builder.py @@ -202,11 +202,7 @@ def source_transform( def _get_dynamic_shape(self) -> Any: dim = torch.export.Dim("token_dim", max=self.model.params.max_seq_len - 1) if self.use_kv_cache: - if self.use_sdpa_with_kv_cache: - return None - else: - # return {1: dim}, {0: dim}} TODO update xnnpack to be able to handle dynamic shape kv cache - return None + return None else: return ({1: dim},) diff --git a/examples/models/llama2/export_llama_lib.py b/examples/models/llama2/export_llama_lib.py index de1e711a2c9..18a7950d58b 100644 --- a/examples/models/llama2/export_llama_lib.py +++ b/examples/models/llama2/export_llama_lib.py @@ -9,6 +9,7 @@ import argparse import copy import logging +import math import os import shlex @@ -23,7 +24,11 @@ XnnpackDynamicallyQuantizedPartitioner, ) -from executorch.examples.models.llama2.llama_transformer import Transformer +from executorch.examples.models.llama2.llama_transformer import ( + KVCache, + SDPA, + Transformer, +) from executorch.exir.backend.backend_details import CompileSpec from executorch.sdk.etrecord import generate_etrecord @@ -88,6 +93,131 @@ def materialze_broadcast_of_rope_freq_cis( return module +class SDPACustom(torch.nn.Module): + def __init__( + self, + kv_cache: KVCache, + dim: int, + ): + super().__init__() + self.kv_cache = kv_cache + self.dim = dim + + def forward( + self, + input_pos: torch.Tensor, + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + bsz, + seqlen, + mask, + ): + output = torch.ops.llama.sdpa_with_kv_cache( + q, + k, + v, + self.kv_cache.k_cache, + self.kv_cache.v_cache, + input_pos[-1].item(), + seqlen, + ) + return output.view(bsz, seqlen, self.dim) + + +def _replace_sdpa_with_custom_op(module: torch.nn.Module): + for name, child in module.named_children(): + if isinstance(child, SDPA): + setattr( + module, + name, + SDPACustom(child.kv_cache, child.dim), + ) + else: + _replace_sdpa_with_custom_op(child) + + +def replace_sdpa_with_custom_op(module: torch.nn.Module) -> torch.nn.Module: + from executorch.examples.models.llama2.custom_ops import sdpa_with_kv_cache # noqa + + _replace_sdpa_with_custom_op(module) + return module + + +class SDPASimple(torch.nn.Module): + + def __init__( + self, + kv_cache: KVCache, + dim: int, + head_dim: int, + n_rep: int, + ): + super().__init__() + self.kv_cache = kv_cache + self.dim = dim + self.head_dim = head_dim + self.n_rep = n_rep + + def forward( + self, + input_pos: torch.Tensor, + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + bsz, + seqlen, + mask, + ): 
+ q = q.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim) + k = k.transpose(1, 2) + v = v.transpose(1, 2) + + k, v = self.kv_cache.update(input_pos, k, v) + attn_mask = mask[None, None, input_pos] + + k = k.repeat_interleave(self.n_rep, dim=1) + v = v.repeat_interleave(self.n_rep, dim=1) + scale_factor = 1 / math.sqrt(q.size(-1)) + attn_weight = q @ k.transpose(-2, -1) * scale_factor + attn_weight += attn_mask + attn_weight = torch.softmax(attn_weight, dim=-1) + y = attn_weight @ v + + return y.transpose(1, 2).contiguous().view(bsz, seqlen, self.dim) + + +def replace_sdpa_with_simple_sdpa(module: torch.nn.Module): + for name, child in module.named_children(): + if isinstance(child, SDPA): + setattr( + module, + name, + SDPASimple(child.kv_cache, child.dim, child.head_dim, child.n_rep), + ) + else: + replace_sdpa_with_simple_sdpa(child) + return module + + +def replace_causal_mask(module: torch.nn.Module): + for buffer_fqn_name, buffer in module.named_buffers(): + buffer_name = buffer_fqn_name.split(".")[-1] + if buffer_name == "mask": + max_seq_len = buffer.shape[-1] + mask = torch.full( + (max_seq_len, max_seq_len), + float("-inf"), + device="cpu", + ) + + mask = torch.triu(mask, diagonal=1) + module.register_buffer(buffer_name, mask) + for _, child in module.named_children(): + replace_causal_mask(child) + return module + + def quantize( model: torch.nn.Module, qmode: str, @@ -482,6 +612,9 @@ def _prepare_for_llama_export(modelname: str, args) -> LlamaEdgeManager: if args.expand_rope_table: transforms.append(materialze_broadcast_of_rope_freq_cis) + if args.use_sdpa_with_kv_cache: + transforms.append(replace_sdpa_with_custom_op) + return ( load_llama_model( checkpoint=checkpoint_path, @@ -605,9 +738,6 @@ def _export_llama(modelname, args) -> str: # noqa: C901 partitioners.append( # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `apple` CoreMLPartitioner( - skip_ops_for_coreml_delegation=[ - "aten.index_put.default", - ], compile_specs=compile_specs, ) ) diff --git a/examples/models/llama2/llama_transformer.py b/examples/models/llama2/llama_transformer.py index 2a259af59cb..189280bb8a5 100644 --- a/examples/models/llama2/llama_transformer.py +++ b/examples/models/llama2/llama_transformer.py @@ -193,6 +193,44 @@ def update( return k_out, v_out +class SDPA(nn.Module): + def __init__( + self, + kv_cache: KVCache, + dim: int, + head_dim: int, + n_rep: int, + ): + super().__init__() + self.kv_cache = kv_cache + self.dim = dim + self.head_dim = head_dim + self.n_rep = n_rep + + def forward( + self, + input_pos: torch.Tensor, + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + bsz, + seqlen, + mask: torch.Tensor, + ) -> torch.Tensor: + q = q.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim) + k = k.transpose(1, 2) + v = v.transpose(1, 2) + + k, v = self.kv_cache.update(input_pos, k, v) + attn_mask = mask[None, None, input_pos] + + k = k.repeat_interleave(self.n_rep, dim=1) + v = v.repeat_interleave(self.n_rep, dim=1) + y = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask, dropout_p=0.0) + + return y.transpose(1, 2).contiguous().view(bsz, seqlen, self.dim) + + class Attention(nn.Module): def __init__(self, args: ModelArgs, layer_id: int): super().__init__() @@ -213,7 +251,6 @@ def __init__(self, args: ModelArgs, layer_id: int): self.wv = nn.Linear(args.dim, self.n_kv_heads * self.head_dim, bias=False) self.wo = nn.Linear(args.n_heads * self.head_dim, args.dim, bias=False) - self.use_sdpa_with_kv_cache_op = 
args.use_sdpa_with_kv_cache_op self.layer_id = layer_id causal_mask = torch.tril( @@ -234,6 +271,12 @@ def __init__(self, args: ModelArgs, layer_id: int): self.head_dim, not args.use_sdpa_with_kv_cache_op, # if we are using the custom op dont transpose the cache. Expect untransposed q k v ) + self.SDPA = SDPA( + kv_cache=self.kv_cache, + dim=self.dim, + head_dim=self.head_dim, + n_rep=self.n_rep, + ) def forward( self, @@ -256,41 +299,8 @@ def forward( if self.use_kv_cache: assert input_pos is not None - - if not self.use_sdpa_with_kv_cache_op: - - q = q.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim) - k = k.transpose(1, 2) - v = v.transpose(1, 2) - - k, v = self.kv_cache.update(input_pos, k, v) - mask = self.mask[None, None, input_pos] - - k = k.repeat_interleave(self.n_rep, dim=1) - v = v.repeat_interleave(self.n_rep, dim=1) - y = F.scaled_dot_product_attention( - q, k, v, attn_mask=mask, dropout_p=0.0 - ) - - y = y.transpose(1, 2).contiguous().view(bsz, seqlen, self.dim) - - y = self.wo(y) - return y - else: - from .custom_ops.sdpa_with_kv_cache import sdpa_with_kv_cache # noqa - - output = torch.ops.llama.sdpa_with_kv_cache( - q, - k, - v, - self.kv_cache.k_cache, - self.kv_cache.v_cache, - input_pos[-1].item(), - seqlen, - ) - output = output.view(bsz, seqlen, -1) - output = self.wo(output) - return output + output = self.SDPA(input_pos, q, k, v, bsz, seqlen, self.mask) + return self.wo(output) q = q.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim) k = k.transpose(1, 2) diff --git a/examples/models/llama2/model.py b/examples/models/llama2/model.py index 68882433679..461c0844435 100644 --- a/examples/models/llama2/model.py +++ b/examples/models/llama2/model.py @@ -173,11 +173,7 @@ def get_eager_model(self): def get_example_inputs(self): if self.use_kv_cache: - if self.use_sdpa_with_kv_cache_op: - return self.get_example_inputs_kvcache_sdpa() - else: - # return self.get_example_inputs_kvcache() TODO xnnpack does not handle forwarding symints, update partitioner to not partition symints - return self.get_example_inputs_kvcache_sdpa() + return self.get_example_inputs_kvcache_sdpa() else: return ( torch.tensor( @@ -195,13 +191,3 @@ def get_example_inputs_kvcache_sdpa(self): [0], dtype=torch.long ), # start_pos, what token of output are we on.) ) - - def get_example_inputs_kvcache(self): - return ( - torch.tensor( - [[1, 2, 3]], dtype=torch.long - ), # tokens, with kv cache our input token length is always just 1 token. - torch.tensor( - [0, 1, 2], dtype=torch.long - ), # start_pos, what token of output are we on. - ) diff --git a/examples/models/llama2/tests/TARGETS b/examples/models/llama2/tests/TARGETS new file mode 100644 index 00000000000..3d2aef6209f --- /dev/null +++ b/examples/models/llama2/tests/TARGETS @@ -0,0 +1,15 @@ +load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest") + +oncall("executorch") + +python_unittest( + name = "test_simple_sdpa", + srcs = [ + "test_simple_sdpa.py", + ], + deps = [ + "//caffe2:torch", + "//executorch/examples/models/llama2:export_library", + "//executorch/examples/models/llama2:llama_transformer", + ], +) diff --git a/examples/models/llama2/tests/test_simple_sdpa.py b/examples/models/llama2/tests/test_simple_sdpa.py new file mode 100644 index 00000000000..e5360f0e0fa --- /dev/null +++ b/examples/models/llama2/tests/test_simple_sdpa.py @@ -0,0 +1,54 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import copy +import unittest + +import torch +from executorch.examples.models.llama2.export_llama_lib import SDPASimple +from executorch.examples.models.llama2.llama_transformer import KVCache, SDPA + + +class SDPATest(unittest.TestCase): + def test_simple_sdpa(self): + # Verify the correctness between the simple SDPA and the original SDPA module defined in llama_transformer.py + max_batch_size = 1 + max_seq_length = 128 + n_heads = 8 + head_dim = 8 + dim = 64 + n_rep = 1 + bsz = 1 + seqlen = 1 + n_local_heads = n_heads + kv_cache = KVCache( + max_batch_size=max_batch_size, + max_seq_length=max_seq_length, + n_heads=n_heads, + head_dim=head_dim, + transpose_cache=True, + ) + sdpa = SDPA( + kv_cache=copy.deepcopy(kv_cache), dim=dim, head_dim=head_dim, n_rep=n_rep + ) + input_pos = torch.tensor([0]) + query = torch.randn(1, 1, n_local_heads, head_dim) + key = torch.randn(1, 1, n_local_heads, head_dim) + value = torch.randn(1, 1, n_local_heads, head_dim) + mask = torch.randn(max_seq_length, max_seq_length) + sdpa_output = sdpa( + input_pos, query, key, value, bsz=bsz, seqlen=seqlen, mask=mask + ) + + simple_sdpa = SDPASimple( + kv_cache=copy.deepcopy(kv_cache), dim=dim, head_dim=head_dim, n_rep=n_rep + ) + simple_sdpa_output = simple_sdpa( + input_pos, query, key, value, bsz=bsz, seqlen=seqlen, mask=mask + ) + + # Compare the output from output from two sdpa implementation + self.assertTrue(torch.allclose(sdpa_output, simple_sdpa_output)) diff --git a/examples/qualcomm/executor_runner/qnn_executor_runner.cpp b/examples/qualcomm/executor_runner/qnn_executor_runner.cpp index b2691da2ec7..8998ee634e0 100644 --- a/examples/qualcomm/executor_runner/qnn_executor_runner.cpp +++ b/examples/qualcomm/executor_runner/qnn_executor_runner.cpp @@ -202,8 +202,10 @@ int main(int argc, char** argv) { // be used by a single thread at at time, but it can be reused. // torch::executor::ETDumpGen etdump_gen = torch::executor::ETDumpGen(); + // TODO: So far we have issues with etdump_gen during load_method. Enable it + // after the issues are fixed. Result method = - program->load_method(method_name, &memory_manager, &etdump_gen); + program->load_method(method_name, &memory_manager, nullptr); ET_CHECK_MSG( method.ok(), "Loading of method %s failed with status 0x%" PRIx32, diff --git a/exir/backend/test/test_partitioner.py b/exir/backend/test/test_partitioner.py index 74974d16231..d492c291f34 100644 --- a/exir/backend/test/test_partitioner.py +++ b/exir/backend/test/test_partitioner.py @@ -26,7 +26,7 @@ from executorch.exir.backend.test.demos.rpc.executor_backend_preprocess import ( ExecutorBackend, ) -from executorch.exir.backend.utils import get_delegates +from executorch.exir.backend.utils import get_delegates, tag_constant_data from executorch.exir.dialects._ops import ops as exir_ops @@ -523,3 +523,85 @@ def partition( "constant data node (b_const) is tagged with (tag0) but has user (aten_sub_tensor) which has tag (None)", str(error.exception), ) + + def test_not_delegate_mutable_buffers(self) -> None: + """ + A test case to check the mutated buffer is not delegated. We'll need to add a test case + to consider when the delegate can consume the mutable buffer. 
+ """ + + class MutableStateModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.register_buffer("my_state", torch.zeros(1)) + + def forward(self, x): + y = x + self.my_state + self.my_state.add_(1) + return y + + edge = exir.to_edge( + torch.export.export( + MutableStateModule(), + (torch.zeros(1),), + ) + ) + self.assertGreater( + len(edge.exported_program().graph_signature.buffers_to_mutate), + 0, + "The test case should at leaset one mutable buffer", + ) + + class PartitionerTagData(Partitioner): + def __init__(self): + super().__init__() + self.delegation_spec = DelegationSpec( + ExecutorBackend.__name__, + [CompileSpec(key, value) for key, value in self.spec.items()], + ) + + def partition( + self, edge_exported_program: ExportedProgram + ) -> PartitionResult: + partition_tags = {} + for node in edge_exported_program.graph.nodes: + if node.op == "call_function" and node.target in [ + exir_ops.edge.aten.add.Tensor + ]: + delegation_tag = "tag0" + node.meta["delegation_tag"] = delegation_tag + partition_tags[delegation_tag] = self.delegation_spec + tag_constant_data(edge_exported_program) + return PartitionResult( + tagged_exported_program=edge_exported_program, + partition_tags=partition_tags, + ) + + # Check the edge program inital buffers_to_mutate + mutate_op = "aten_add_tensor_1" + self.assertEqual( + edge.exported_program().graph_signature.buffers_to_mutate[mutate_op], + "my_state", + ) + edge = edge.to_backend(PartitionerTagData()) + # After to_backend, add is delegated and is no longer in buffers_to_mutate. + self.assertNotIn( + mutate_op, + edge.exported_program().graph_signature.buffers_to_mutate, + ) + + mutate_op = "getitem_1" + # Ensure the mutated buffer is not delegated, and the new mutate node is getitem (from call_delegate) + self.assertEqual( + edge.exported_program().graph_signature.buffers_to_mutate[mutate_op], + "my_state", + ) + # Check the copy_ node is inserted + edge = edge.to_executorch() + copy_node = [ + node + for node in edge.exported_program().graph.nodes + if node.op == "call_function" + and node.target == torch.ops.aten.copy_.default + ] + self.assertEqual(len(copy_node), 1) diff --git a/exir/backend/utils.py b/exir/backend/utils.py index f4c1c28f8bd..b299ba4be8a 100644 --- a/exir/backend/utils.py +++ b/exir/backend/utils.py @@ -508,6 +508,20 @@ def tag_constant_data(edge_program: ExportedProgram) -> None: subgraph. Throw error when const/param/buffers is used across different partitions. That is the underlying data will be owned by multiple delegates. """ + mutated_buffer = set() + for node in edge_program.graph.nodes: + if node.op == "placeholder" and ( + is_param(edge_program, node) + or is_buffer(edge_program, node) + or is_lifted_tensor_constant(edge_program, node) + ): + for node_user in node.users: + if node_user.name in edge_program.graph_signature.buffers_to_mutate: + logging.info( + "The buffer node is a mutated buffer node, which is not constant." 
+ ) + mutated_buffer.add(node) + for node in edge_program.graph.nodes: # go through const/param/buffer nodes, if all users of const/param/buffer nodes are partitioned then partition if node.op == "placeholder" and ( @@ -515,20 +529,21 @@ def tag_constant_data(edge_program: ExportedProgram) -> None: or is_buffer(edge_program, node) or is_lifted_tensor_constant(edge_program, node) ): - user_tags = set() - for user in node.users: - user_tag = user.meta.get("delegation_tag", None) - if user_tag is not None: - user_tags.add(user_tag) - if len(user_tags) > 1: - logging.info( - f"The data node is used across multiple partitions, including {user_tags}. " - "If the data is too large and it's not preferred to copy, please tag the " - "constant node like node.['no_copy'] = True and they won't be copied." - ) - # tag the data node with the same tag as the last user - if len(user_tags) > 0: - node.meta["delegation_tag"] = user_tags.pop() + if node not in mutated_buffer: + user_tags = set() + for user in node.users: + user_tag = user.meta.get("delegation_tag", None) + if user_tag is not None: + user_tags.add(user_tag) + if len(user_tags) > 1: + logging.info( + f"The data node is used across multiple partitions, including {user_tags}. " + "If the data is too large and it's not preferred to copy, please tag the " + "constant node like node.['no_copy'] = True and they won't be copied." + ) + # tag the data node with the same tag as the last user + if len(user_tags) > 0: + node.meta["delegation_tag"] = user_tags.pop() # TODO - style: use templated types diff --git a/extension/android/CMakeLists.txt b/extension/android/CMakeLists.txt index c69cea0323e..4d4460203c0 100644 --- a/extension/android/CMakeLists.txt +++ b/extension/android/CMakeLists.txt @@ -58,14 +58,9 @@ if(EXECUTORCH_BUILD_LLAMA_JNI) add_library(llama_runner STATIC IMPORTED) set_property(TARGET llama_runner PROPERTY IMPORTED_LOCATION ${LLAMA_RUNNER_PATH}) - set(CUSTOM_OPS_LIB_PATH ${CMAKE_CURRENT_BINARY_DIR}/../../examples/models/llama2/custom_ops/libcustom_ops_lib.a) - add_library(custom_ops_lib STATIC IMPORTED) - set_property(TARGET custom_ops_lib PROPERTY IMPORTED_LOCATION ${CUSTOM_OPS_LIB_PATH}) - set(CUSTOM_OPS_PATH ${CMAKE_CURRENT_BINARY_DIR}/../../examples/models/llama2/custom_ops/libcustom_ops.a) add_library(custom_ops STATIC IMPORTED) set_property(TARGET custom_ops PROPERTY IMPORTED_LOCATION ${CUSTOM_OPS_PATH}) - target_link_options_shared_lib(custom_ops_lib) if(TARGET pthreadpool) set(LLAMA_JNI_SRCS jni/jni_layer_llama.cpp ../../backends/xnnpack/threadpool/cpuinfo_utils.cpp) @@ -82,6 +77,6 @@ if(EXECUTORCH_BUILD_LLAMA_JNI) endif() target_include_directories(executorch_llama_jni PRIVATE ${_common_include_directories}) target_link_libraries(executorch_llama_jni ${link_libraries} llama_runner - custom_ops custom_ops_lib cpublas eigen_blas) + custom_ops cpublas eigen_blas) target_compile_options(executorch_llama_jni PUBLIC ${_common_compile_options}) endif() diff --git a/install_requirements.sh b/install_requirements.sh index c96aefc5628..8432e2184a9 100755 --- a/install_requirements.sh +++ b/install_requirements.sh @@ -9,7 +9,7 @@ # Dependencies are defined in .pyproject.toml if [[ -z $PYTHON_EXECUTABLE ]]; then - if [[ -z $CONDA_DEFAULT_ENV ]] || [[ $CONDA_DEFAULT_ENV == "base" ]]; + if [[ -z $CONDA_DEFAULT_ENV ]] || [[ $CONDA_DEFAULT_ENV == "base" ]] || [[ ! 
-x "$(command -v python)" ]]; then PYTHON_EXECUTABLE=python3 else @@ -17,6 +17,14 @@ then fi fi +if [[ "$PYTHON_EXECUTABLE" == "python" ]]; +then + PIP_EXECUTABLE=pip +else + PIP_EXECUTABLE=pip3 +fi + + # Parse options. EXECUTORCH_BUILD_PYBIND=OFF CMAKE_ARGS="" @@ -49,24 +57,20 @@ done # Since ExecuTorch often uses main-branch features of pytorch, only the nightly # pip versions will have the required features. The NIGHTLY_VERSION value should # agree with the third-party/pytorch pinned submodule commit. -# -# NOTE: If a newly-fetched version of the executorch repo changes the value of -# NIGHTLY_VERSION, you should re-run this script to install the necessary -# package versions. -NIGHTLY_VERSION=dev20240324 # The pip repository that hosts nightly torch packages. -TORCH_NIGHTLY_URL="https://download.pytorch.org/whl/nightly/cpu" +TORCH_URL="https://download.pytorch.org/whl/test/cpu" # pip packages needed by exir. EXIR_REQUIREMENTS=( - torch=="2.4.0.${NIGHTLY_VERSION}" - torchvision=="0.19.0.${NIGHTLY_VERSION}" # For testing. + torch=="2.3.0" + torchvision=="0.18.0" ) # pip packages needed for development. DEVEL_REQUIREMENTS=( cmake # For building binary targets. + pyyaml # Imported by the kernel codegen tools. setuptools # For building the pip package. tomli # Imported by extract_sources.py when using python < 3.11. wheel # For building the pip package archive. @@ -77,7 +81,7 @@ DEVEL_REQUIREMENTS=( # TODO(dbort): Make each example publish its own requirements.txt EXAMPLES_REQUIREMENTS=( timm==0.6.13 - torchaudio=="2.2.0.${NIGHTLY_VERSION}" + torchaudio=="2.3.0" torchsr==1.0.4 transformers==4.38.2 ) @@ -92,7 +96,7 @@ REQUIREMENTS_TO_INSTALL=( # Install the requirements. `--extra-index-url` tells pip to look for package # versions on the provided URL if they aren't available on the default URL. -pip install --extra-index-url "${TORCH_NIGHTLY_URL}" \ +$PIP_EXECUTABLE install --extra-index-url "${TORCH_URL}" \ "${REQUIREMENTS_TO_INSTALL[@]}" # @@ -101,4 +105,4 @@ pip install --extra-index-url "${TORCH_NIGHTLY_URL}" \ EXECUTORCH_BUILD_PYBIND="${EXECUTORCH_BUILD_PYBIND}" \ CMAKE_ARGS="${CMAKE_ARGS}" \ - pip install . --no-build-isolation -v + $PIP_EXECUTABLE install . --no-build-isolation -v diff --git a/kernels/portable/cpu/op_isinf.cpp b/kernels/portable/cpu/op_isinf.cpp index da8599d5fac..0ac1fa11955 100644 --- a/kernels/portable/cpu/op_isinf.cpp +++ b/kernels/portable/cpu/op_isinf.cpp @@ -14,8 +14,18 @@ namespace torch { namespace executor { namespace native { +namespace { +// Passing std::isinf directly to unary_ufunc_realhb_to_bool can cause "error: +// cannot resolve overloaded function ‘isinf’ based on conversion to type +// ‘torch::executor::FunctionRef’" in some compilation +// environments. 
+bool isinf_wrapper(double num) { + return std::isinf(num); +} +} // namespace + Tensor& isinf_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) { - return internal::unary_ufunc_realhb_to_bool(std::isinf, ctx, in, out); + return internal::unary_ufunc_realhb_to_bool(isinf_wrapper, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_isnan.cpp b/kernels/portable/cpu/op_isnan.cpp index 2a82b127d3e..d9ef038b73a 100644 --- a/kernels/portable/cpu/op_isnan.cpp +++ b/kernels/portable/cpu/op_isnan.cpp @@ -14,8 +14,18 @@ namespace torch { namespace executor { namespace native { +namespace { +// Passing std::isnan directly to unary_ufunc_realhb_to_bool can cause "error: +// cannot resolve overloaded function ‘isnan’ based on conversion to type +// ‘torch::executor::FunctionRef’" in some compilation +// environments. +bool isnan_wrapper(double num) { + return std::isnan(num); +} +} // namespace + Tensor& isnan_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) { - return internal::unary_ufunc_realhb_to_bool(std::isnan, ctx, in, out); + return internal::unary_ufunc_realhb_to_bool(isnan_wrapper, ctx, in, out); } } // namespace native diff --git a/pyproject.toml b/pyproject.toml index ddd7bb0914c..347db21ec42 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,10 +1,20 @@ [build-system] -requires = ["setuptools", "wheel"] +requires = [ + "cmake", # For building binary targets in the wheel. + "pyyaml", # Imported by the kernel codegen tools. + "setuptools", # For building the pip package contents. + "tomli", # Imported by extract_sources.py when using python < 3.11. + "wheel", # For building the pip package archive. + "zstd", # Imported by resolve_buck.py. +] build-backend = "setuptools.build_meta" [project] name = "executorch" -version = "0.1.0" +# TODO(dbort): Use setuptools-git-versioning or setuptools-scm to get the +# version from the git branch state. For now, use a version that doesn't look +# like a real release. 
+version = "0.2.0.dev0+unknown" # Python dependencies required for development dependencies=[ "expecttest", diff --git a/runtime/executor/targets.bzl b/runtime/executor/targets.bzl index b3db52720c7..0f451b1a70a 100644 --- a/runtime/executor/targets.bzl +++ b/runtime/executor/targets.bzl @@ -44,9 +44,20 @@ def define_common_targets(): for aten_mode in (True, False): aten_suffix = "_aten" if aten_mode else "" - runtime.cxx_library( name = "program" + aten_suffix, + exported_deps = [ + ":program_no_prim_ops" + aten_suffix, + "//executorch/kernels/prim_ops:prim_ops_registry" + aten_suffix, + ], + visibility = [ + "//executorch/runtime/executor/...", + "@EXECUTORCH_CLIENTS", + ], + ) + + runtime.cxx_library( + name = "program_no_prim_ops" + aten_suffix, srcs = [ "method.cpp", "method_meta.cpp", @@ -54,34 +65,26 @@ def define_common_targets(): "tensor_parser_exec_aten.cpp", "tensor_parser{}.cpp".format(aten_suffix if aten_mode else "_portable"), ], - headers = [ - "tensor_parser.h", - ], exported_headers = [ "method.h", "method_meta.h", "program.h", + "tensor_parser.h", ], - deps = [ - "//executorch/kernels/prim_ops:prim_ops_registry" + aten_suffix, + preprocessor_flags = _program_preprocessor_flags(), + exported_deps = [ + ":memory_manager", "//executorch/runtime/backend:interface", - "//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix, "//executorch/runtime/core:core", + "//executorch/runtime/core:evalue" + aten_suffix, + "//executorch/runtime/core:event_tracer" + aten_suffix, + "//executorch/runtime/core/exec_aten:lib" + aten_suffix, + "//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix, "//executorch/runtime/kernel:kernel_runtime_context" + aten_suffix, "//executorch/runtime/kernel:operator_registry", "//executorch/runtime/platform:platform", "//executorch/schema:extended_header", "//executorch/schema:program", - ":memory_manager", - ], - preprocessor_flags = _program_preprocessor_flags(), - exported_deps = [ - "//executorch/runtime/core/exec_aten:lib" + aten_suffix, - "//executorch/runtime/core:core", - "//executorch/runtime/core:evalue" + aten_suffix, - "//executorch/runtime/platform:platform", - "//executorch/runtime/core:event_tracer" + aten_suffix, - ":memory_manager", ], visibility = [ "//executorch/runtime/executor/...", diff --git a/setup.py b/setup.py index bef57764b9d..6458fdc10c6 100644 --- a/setup.py +++ b/setup.py @@ -83,11 +83,6 @@ def _is_env_enabled(env_var: str, default: bool = False) -> bool: def pybindings(cls) -> bool: return cls._is_env_enabled("EXECUTORCH_BUILD_PYBIND", default=False) - @classmethod - @property - def xnnpack(cls) -> bool: - return cls._is_env_enabled("EXECUTORCH_BUILD_XNNPACK", default=False) - class _BaseExtension(Extension): """A base class that maps an abstract source to an abstract destination.""" @@ -372,13 +367,9 @@ def run(self): "-DEXECUTORCH_BUILD_PYBIND=ON", ] build_args += ["--target", "portable_lib"] - if ShouldBuild.xnnpack: - cmake_args += [ - "-DEXECUTORCH_BUILD_XNNPACK=ON", - ] - # No target needed; the cmake arg will link xnnpack - # into the portable_lib target. - # TODO(dbort): Add MPS/CoreML backends when building on macos. + # To link backends into the portable_lib target, callers should + # add entries like `-DEXECUTORCH_BUILD_XNNPACK=ON` to the CMAKE_ARGS + # environment variable. # Allow adding extra cmake args through the environment. 
Used by some # tests and demos to expand the set of targets included in the pip @@ -398,7 +389,17 @@ def run(self): if not self.dry_run: # Dry run should log the command but not actually run it. (Path(cmake_cache_dir) / "CMakeCache.txt").unlink(missing_ok=True) - self.spawn(["cmake", "-S", repo_root, "-B", cmake_cache_dir, *cmake_args]) + try: + # This script is sometimes run as root in docker containers. buck2 + # doesn't allow running as root unless $HOME is owned by root or + # does not exist. So temporarily undefine it while configuring + # cmake, which runs buck2 to get some source lists. + old_home = os.environ.pop("HOME", None) + # Generate the build system files. + self.spawn(["cmake", "-S", repo_root, "-B", cmake_cache_dir, *cmake_args]) + finally: + if old_home is not None: + os.environ["HOME"] = old_home # Build the system. self.spawn(["cmake", "--build", cmake_cache_dir, *build_args]) diff --git a/third-party/pytorch b/third-party/pytorch index 0a038cf0cff..23961cef856 160000 --- a/third-party/pytorch +++ b/third-party/pytorch @@ -1 +1 @@ -Subproject commit 0a038cf0cff2d071b7359ac0491fd2ba7798a438 +Subproject commit 23961cef8565b2d01db5280ab518939b74bd5ff5
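The `setup.py` hunk above pops `HOME` while CMake configures the build, because buck2 refuses to run as root unless `HOME` is owned by root or is unset, and then restores it in a `finally` block. A small sketch of the same pattern factored into a reusable context manager; the `env_var_removed` name and the cmake invocation below are illustrative only, not part of this patch:

```python
import os
import subprocess
from contextlib import contextmanager


@contextmanager
def env_var_removed(name: str):
    """Temporarily remove an environment variable and restore it on exit."""
    old_value = os.environ.pop(name, None)
    try:
        yield
    finally:
        if old_value is not None:
            os.environ[name] = old_value


# Usage sketch: run the CMake configure step with $HOME unset, mirroring the
# try/finally added to setup.py (paths and arguments are placeholders).
with env_var_removed("HOME"):
    subprocess.run(["cmake", "-S", ".", "-B", "cmake-out"], check=True)
```

Restoring the variable in `finally` guarantees `HOME` comes back even if the configure step raises, which is the same property the patched `setup.py` relies on.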